Coverage for src/wiktextract/extractor/ja/pos.py: 92%
77 statements
« prev ^ index » next coverage.py v7.6.10, created at 2024-12-27 08:07 +0000
1from wikitextprocessor.parser import LevelNode, NodeKind, TemplateNode, WikiNode
3from ...page import clean_node
4from ...wxr_context import WiktextractContext
5from ..ruby import extract_ruby
6from .example import extract_example_list_item
7from .header import extract_header_nodes
8from .models import AltForm, Sense, WordEntry
9from .section_titles import POS_DATA
10from .tags import translate_raw_tags
def parse_pos_section(
    wxr: WiktextractContext,
    page_data: list[WordEntry],
    base_data: WordEntry,
    level_node: LevelNode,
    pos_title: str,
) -> None:
    """Build a new word entry for one part-of-speech section.

    A deep copy of ``base_data`` is appended to ``page_data`` and filled
    with the POS name/tags and the senses found in the section's gloss
    lists.  If the section turns out to contain no gloss list at all,
    the freshly added entry is removed again.
    """
    entry = base_data.model_copy(deep=True)
    entry.pos_title = pos_title
    pos_info = POS_DATA[pos_title]
    entry.pos = pos_info["pos"]
    entry.tags.extend(pos_info.get("tags", []))
    page_data.append(entry)

    first_gloss_index = 0
    for child_index, child_list in level_node.find_child(NodeKind.LIST, True):
        # Lists whose sarg does not end in "#" are linkage lists, not glosses.
        if not child_list.sarg.endswith("#"):
            continue
        if first_gloss_index == 0:
            first_gloss_index = child_index
        for item in child_list.find_child(NodeKind.LIST_ITEM):
            process_gloss_list_item(wxr, entry, item)

    # Header data (forms, ruby, etc.) sits before the first gloss list.
    extract_header_nodes(wxr, entry, level_node.children[:first_gloss_index])
    if first_gloss_index == 0:
        # No gloss list found: discard the entry added above.
        page_data.pop()
def process_gloss_list_item(
    wxr: WiktextractContext,
    word_entry: WordEntry,
    list_item_node: WikiNode,
    parent_gloss: str = "",
) -> None:
    """Extract one sense from a gloss list item.

    Tag templates ("context"/"タグ") become raw tags, "wikipedia-s" is
    replaced by its expanded text, and "wp" is dropped; the remaining
    nodes form the gloss text.  Afterwards any nested "*"/":" sub-lists
    are processed as examples and nested "#" sub-lists recursively as
    child glosses (with this gloss passed down as their parent).
    """
    sense = Sense()
    find_form_of_data(wxr, word_entry, sense, list_item_node)
    if parent_gloss != "":
        sense.glosses.append(parent_gloss)

    kept_nodes = []
    for child in list_item_node.invert_find_child(
        NodeKind.LIST, include_empty_str=True
    ):
        if not isinstance(child, TemplateNode):
            kept_nodes.append(child)
        elif child.template_name in ("context", "タグ"):
            # https://ja.wiktionary.org/wiki/テンプレート:context
            # https://ja.wiktionary.org/wiki/テンプレート:タグ
            tag_text = clean_node(wxr, sense, child).strip("()")
            for piece in tag_text.split(","):
                piece = piece.strip()
                if piece != "":
                    sense.raw_tags.append(piece)
        elif child.template_name == "wikipedia-s":
            # Keep only the linked word, without the "(wp)" superscript.
            expanded_text = clean_node(wxr, None, child)
            kept_nodes.append(expanded_text.removesuffix("⁽ʷᵖ⁾").strip())
        elif child.template_name == "wp":
            continue
        else:
            kept_nodes.append(child)

    expanded = wxr.wtp.parse(
        wxr.wtp.node_to_wikitext(kept_nodes), expand_all=True
    )
    ruby, without_ruby = extract_ruby(wxr, expanded.children)
    gloss_text = clean_node(wxr, sense, without_ruby)
    sense.ruby = ruby
    if gloss_text != "":
        sense.glosses.append(gloss_text)
        translate_raw_tags(sense)
        word_entry.senses.append(sense)

    for sub_list in list_item_node.find_child(NodeKind.LIST):
        if sub_list.sarg.endswith(("*", ":")):
            # Example sentences live in "*"/":" sub-lists.
            for example_item in sub_list.find_child(NodeKind.LIST_ITEM):
                extract_example_list_item(
                    wxr, word_entry, sense, example_item
                )
        elif sub_list.sarg.endswith("#"):
            # Nested glosses: recurse with this gloss as the parent.
            for nested_item in sub_list.find_child(NodeKind.LIST_ITEM):
                process_gloss_list_item(
                    wxr, word_entry, nested_item, gloss_text
                )
def find_form_of_data(
    wxr: WiktextractContext,
    word_entry: WordEntry,
    sense: Sense,
    list_item_node: WikiNode,
) -> None:
    """Populate ``sense.form_of`` from "… of" templates in the list item.

    For each template whose name ends in " of", the first non-empty link
    target of its expansion is recorded.  If the entry carries the
    "form-of" tag and nothing was found that way, the first plain link
    of the list item is used as a fallback.
    """
    for template in list_item_node.find_child(NodeKind.TEMPLATE):
        if not template.template_name.endswith(" of"):
            continue
        expanded = wxr.wtp.parse(
            wxr.wtp.node_to_wikitext(template), expand_all=True
        )
        for link in expanded.find_child_recursively(NodeKind.LINK):
            target = clean_node(wxr, None, link)
            if target != "":
                sense.form_of.append(AltForm(word=target))
                break  # only the first link of each template

    if "form-of" in word_entry.tags and len(sense.form_of) == 0:
        for link in list_item_node.find_child(NodeKind.LINK):
            target = clean_node(wxr, None, link)
            if target != "":
                sense.form_of.append(AltForm(word=target))
                break  # only the first non-empty link