Coverage for src/wiktextract/extractor/id/pos.py: 81%
126 statements
« prev ^ index » next coverage.py v7.9.2, created at 2025-07-04 10:58 +0000
« prev ^ index » next coverage.py v7.9.2, created at 2025-07-04 10:58 +0000
1from wikitextprocessor.parser import (
2 LEVEL_KIND_FLAGS,
3 HTMLNode,
4 LevelNode,
5 NodeKind,
6 TemplateNode,
7 WikiNode,
8)
10from ...page import clean_node
11from ...wxr_context import WiktextractContext
12from .example import extract_example_list_item
13from .models import AltForm, Example, Form, Sense, WordEntry
14from .section_titles import POS_DATA
15from .tags import translate_raw_tags
def extract_pos_section(
    wxr: WiktextractContext,
    page_data: list[WordEntry],
    base_data: WordEntry,
    level_node: LevelNode,
    pos_title: str,
) -> None:
    """Start a new word entry for a part-of-speech section and populate it.

    A deep copy of ``base_data`` is appended to ``page_data`` and becomes the
    entry for this section.  Its ``pos`` and extra tags are read from
    ``POS_DATA[pos_title]``.

    Args:
        wxr: Extraction context handed through to the helper functions.
        page_data: Entries collected for the page; the new entry is appended.
        base_data: Template entry that is deep-copied for this section.
        level_node: The section node whose direct children are scanned.
        pos_title: Section title, used as the key into ``POS_DATA``.
    """
    # Templates that are forwarded to extract_lihat_2_template().
    lihat_template_names = (
        "lihat 2",
        "lihat ulang",
        "lihat v",
        "lihat2 a",
        "lihat2 adv",
        "lihat v ber2",
        "lihat n",
        "lihat 2 an",
        "lihat v ter2",
        "lihat2 v",
    )

    entry = base_data.model_copy(deep=True)
    page_data.append(entry)
    entry.pos_title = pos_title
    pos_info = POS_DATA[pos_title]
    entry.pos = pos_info["pos"]
    entry.tags.extend(pos_info.get("tags", []))

    children = level_node.children
    # Index of the first gloss list that yielded an item; everything before
    # it is treated as the section's header line.
    header_end = len(children)
    for position, child in enumerate(children):
        if isinstance(child, WikiNode) and child.kind == NodeKind.LIST:
            # Only lists whose marker starts and ends with "#" hold glosses.
            marker = child.sarg
            if not (marker.startswith("#") and marker.endswith("#")):
                continue
            for gloss_item in child.find_child(NodeKind.LIST_ITEM):
                extract_gloss_list_item(wxr, entry, gloss_item)
                header_end = min(header_end, position)
        elif (
            isinstance(child, TemplateNode)
            and child.template_name in lihat_template_names
        ):
            extract_lihat_2_template(wxr, entry, child)

    process_pos_header_nodes(wxr, entry, children[:header_end])
def extract_gloss_list_item(
    wxr: WiktextractContext,
    word_entry: WordEntry,
    list_item: WikiNode,
    parent_sense: Sense | None = None,
) -> None:
    """Build a sense from one gloss list item and recurse into its sub-lists.

    The sense starts as a deep copy of ``parent_sense`` when one is given,
    otherwise as an empty ``Sense``.  The item's direct children are handled
    as follows:

    * templates whose name starts with ``"variasi"`` go to
      ``extract_variasi_template``;
    * any other template is expanded to text; text that starts with "(" and,
      once stripped, ends with ")" or ") ·" is split on "·" into raw tags,
      anything else becomes part of the gloss;
    * the first ``<br>`` flips a flag, and italic nodes seen after it are
      stored as examples;
    * nested lists are left out of the gloss; every other node is part of it.

    The sense is appended to ``word_entry.senses`` only if it ends up with at
    least one gloss.  Nested lists are processed afterwards in either case.

    Args:
        wxr: Extraction context passed to ``clean_node`` and the helpers.
        word_entry: Entry that receives the resulting senses.
        list_item: The gloss list item node.
        parent_sense: Sense of the enclosing list item, if this is a sub-gloss.
    """
    if parent_sense is None:
        sense = Sense()
    else:
        sense = parent_sense.model_copy(deep=True)

    gloss_parts = []
    seen_br = False
    for child in list_item.children:
        if isinstance(child, TemplateNode):
            if child.template_name.startswith("variasi"):
                extract_variasi_template(wxr, sense, child)
                continue
            text = clean_node(wxr, sense, child)
            looks_like_tags = text.startswith("(") and text.strip().endswith(
                (")", ") ·")
            )
            if not looks_like_tags:
                gloss_parts.append(text)
                continue
            for piece in text.split("·"):
                label = piece.strip("() ")
                if label != "":
                    sense.raw_tags.append(label)
        elif isinstance(child, HTMLNode) and child.tag == "br" and not seen_br:
            # Only the first <br> is consumed; later ones fall through to the
            # final branch and stay in the gloss nodes.
            seen_br = True
        elif (
            isinstance(child, WikiNode)
            and child.kind == NodeKind.ITALIC
            and seen_br
        ):
            example_text = clean_node(wxr, None, child)
            if example_text != "":
                sense.examples.append(Example(text=example_text))
        elif isinstance(child, WikiNode) and child.kind == NodeKind.LIST:
            # Sub-lists are handled after the sense itself has been stored.
            continue
        else:
            gloss_parts.append(child)

    gloss_text = clean_node(wxr, sense, gloss_parts)
    if gloss_text != "":
        sense.glosses.append(gloss_text)
        if gloss_text.startswith("bentuk "):
            find_form_of_link(wxr, sense, gloss_parts)

    if sense.glosses:
        translate_raw_tags(sense)
        word_entry.senses.append(sense)

    for sub_list in list_item.find_child(NodeKind.LIST):
        marker = sub_list.sarg
        if not marker.startswith("#"):
            continue
        if marker.endswith("#"):
            # Nested gloss list: each item inherits from the current sense.
            for sub_item in sub_list.find_child(NodeKind.LIST_ITEM):
                extract_gloss_list_item(wxr, word_entry, sub_item, sense)
        elif marker.endswith((":", "*")):
            # "#:" / "#*" style lists are passed on as example items.
            for example_item in sub_list.find_child(NodeKind.LIST_ITEM):
                extract_example_list_item(
                    wxr, word_entry, sense, example_item
                )
def extract_lihat_2_template(
    wxr: WiktextractContext, word_entry: WordEntry, t_node: TemplateNode
) -> None:
    """Add one sense per list item produced by a "lihat 2"-style template.

    The template is converted back to wikitext, re-parsed with full
    expansion, and every list item of the result is cleaned to text.  The
    text after the first "⇢" becomes the gloss.  If the text *before* the
    arrow contains ")", everything up to that ")" (with "(" and spaces
    stripped) becomes a raw tag.
    NOTE(review): this presumably matches an expanded shape of
    "(label) ⇢ gloss" — confirm against the template page linked below.

    Items without an arrow, or with nothing after it, produce no sense.

    Args:
        wxr: Extraction context; ``wxr.wtp`` is used to re-parse the template.
        word_entry: Entry whose ``senses`` list receives the new senses.
        t_node: The template node to expand.
    """
    # https://id.wiktionary.org/wiki/Templat:lihat_2
    expanded_template = wxr.wtp.parse(
        wxr.wtp.node_to_wikitext(t_node), expand_all=True
    )
    for list_node in expanded_template.find_child(NodeKind.LIST):
        for list_item in list_node.find_child(NodeKind.LIST_ITEM):
            sense = Sense()
            item_text = clean_node(wxr, sense, list_item.children)
            label_part, arrow, gloss_part = item_text.partition("⇢")
            gloss = gloss_part.strip()
            # Skip items that would yield a sense with no usable gloss.
            if arrow == "" or gloss == "":
                continue
            sense.glosses.append(gloss)

            # Look for the closing parenthesis only before the arrow, so a
            # ")" inside the gloss can never drag the gloss text into the tag.
            if ")" in label_part:
                raw_tag = label_part[: label_part.index(")")].strip("( ")
                if raw_tag != "":
                    sense.raw_tags.append(raw_tag)
                    # Same post-processing as the other raw-tag sites here.
                    translate_raw_tags(sense)
            word_entry.senses.append(sense)
def process_pos_header_nodes(
    wxr: WiktextractContext, word_entry: WordEntry, nodes: list[WikiNode | str]
) -> None:
    """Collect word forms from the nodes that precede the gloss list.

    The nodes are scanned in order.  A string that, once stripped, ends with
    ":" sets the label applied to later forms.  Link nodes are only
    considered after a bold node has been seen; the cleaned first link
    argument becomes the form text.

    Args:
        wxr: Extraction context passed to ``clean_node``.
        word_entry: Entry whose ``forms`` list receives the new forms.
        nodes: Header nodes, a mix of strings and parsed nodes.
    """
    current_label = ""
    seen_bold = False
    for item in nodes:
        if isinstance(item, str):
            if item.strip().endswith(":"):
                current_label = item.strip("():;, ")
            continue
        if not isinstance(item, WikiNode):
            continue
        if item.kind == NodeKind.BOLD:
            seen_bold = True
            continue

        is_form_link = (
            item.kind == NodeKind.LINK
            and seen_bold
            and clean_node(wxr, None, item) != ""
            and len(item.largs) > 0
        )
        if not is_form_link:
            continue

        form_text = clean_node(wxr, None, item.largs[0])
        if form_text == "":
            continue
        form = Form(form=form_text)
        if current_label != "":
            form.raw_tags.append(current_label)
            translate_raw_tags(form)
        word_entry.forms.append(form)
def extract_variasi_template(
    wxr: WiktextractContext, sense: Sense, t_node: TemplateNode
) -> None:
    """Record the data of a "variasi*" template on ``sense``.

    Positional parameters 1 to 3 are cleaned and each non-empty one is added
    to ``sense.alt_of``.  The cleaned text of the whole template, when not
    empty, is added as a gloss.  The "alt-of" tag is always appended.

    Args:
        wxr: Extraction context passed to ``clean_node``.
        sense: Sense that is updated in place.
        t_node: The "variasi*" template node.
    """
    params = t_node.template_parameters
    for position in (1, 2, 3):
        alt_word = clean_node(wxr, None, params.get(position, ""))
        if alt_word != "":
            sense.alt_of.append(AltForm(word=alt_word))

    template_text = clean_node(wxr, sense, t_node)
    if template_text != "":
        sense.glosses.append(template_text)
    sense.tags.append("alt-of")
def find_form_of_link(
    wxr: WiktextractContext, sense: Sense, gloss_nodes: list[WikiNode | str]
) -> None:
    """Mark ``sense`` as a form-of entry using a link found in its gloss.

    Links are looked for directly among ``gloss_nodes`` and among the direct
    children of italic nodes.  The last link found wins.  If its cleaned text
    is not empty, it is added to ``sense.form_of`` together with the
    "form-of" tag.

    Args:
        wxr: Extraction context passed to ``clean_node``.
        sense: Sense that is updated in place.
        gloss_nodes: Nodes and strings that make up the gloss.
    """
    # pre-expanded "nomina *", "imbuhan *", "ulang *", "verba *" templates
    target = ""
    for item in gloss_nodes:
        if not isinstance(item, WikiNode):
            continue
        if item.kind == NodeKind.LINK:
            links = [item]
        elif item.kind == NodeKind.ITALIC:
            links = item.find_child(NodeKind.LINK)
        else:
            continue
        for link in links:
            target = clean_node(wxr, None, link)

    if target == "":
        return
    sense.form_of.append(AltForm(word=target))
    sense.tags.append("form-of")
def extract_usage_section(
    wxr: WiktextractContext, word_entry: WordEntry, section_node: LevelNode
) -> None:
    """Store the notes of a usage section in ``word_entry.notes``.

    Every list item, at any nesting depth, becomes its own note built from
    the item's content minus its nested lists.  Scanning stops at the first
    child whose kind is in ``LEVEL_KIND_FLAGS``.  All remaining children
    seen before that point are cleaned together and, if the result is not
    empty, appended as one final note.

    Args:
        wxr: Extraction context passed to ``clean_node``.
        word_entry: Entry whose ``notes`` list is extended.
        section_node: The usage section node.
    """
    loose_nodes = []
    for child in section_node.children:
        is_wiki_node = isinstance(child, WikiNode)
        if is_wiki_node and child.kind == NodeKind.LIST:
            for item in child.find_child_recursively(NodeKind.LIST_ITEM):
                own_content = list(item.invert_find_child(NodeKind.LIST))
                item_note = clean_node(wxr, word_entry, own_content)
                if item_note != "":
                    word_entry.notes.append(item_note)
            continue
        if is_wiki_node and child.kind in LEVEL_KIND_FLAGS:
            # A nested section starts here; do not read into it.
            break
        loose_nodes.append(child)

    loose_note = clean_node(wxr, word_entry, loose_nodes)
    if loose_note != "":
        word_entry.notes.append(loose_note)