Coverage for src/wiktextract/extractor/id/pos.py: 82%
133 statements
« prev ^ index » next coverage.py v7.10.3, created at 2025-08-15 05:18 +0000
« prev ^ index » next coverage.py v7.10.3, created at 2025-08-15 05:18 +0000
1from wikitextprocessor.parser import (
2 LEVEL_KIND_FLAGS,
3 HTMLNode,
4 LevelNode,
5 NodeKind,
6 TemplateNode,
7 WikiNode,
8)
10from ...page import clean_node
11from ...wxr_context import WiktextractContext
12from .example import extract_example_list_item
13from .models import AltForm, Attestation, Example, Form, Sense, WordEntry
14from .section_titles import POS_DATA
15from .tags import translate_raw_tags
# Names of the "lihat ..." templates whose expansion is handed to
# extract_lihat_2_template.
_LIHAT_TEMPLATE_NAMES = frozenset(
    {
        "lihat 2",
        "lihat ulang",
        "lihat v",
        "lihat2 a",
        "lihat2 adv",
        "lihat v ber2",
        "lihat n",
        "lihat 2 an",
        "lihat v ter2",
        "lihat2 v",
    }
)


def extract_pos_section(
    wxr: WiktextractContext,
    page_data: list[WordEntry],
    base_data: WordEntry,
    level_node: LevelNode,
    pos_title: str,
) -> None:
    """Append a new entry for a part-of-speech section and fill it in.

    A deep copy of ``base_data`` is appended to ``page_data`` and receives
    the POS title plus the ``pos``/``tags`` values looked up in
    ``POS_DATA``.  Gloss lists (lists whose marker starts and ends with
    ``#``) and the "lihat ..." templates among the section's direct children
    are turned into senses.  The children located before the first gloss
    list that has at least one item are passed to
    ``process_pos_header_nodes``.

    Raises:
        KeyError: if ``pos_title`` is not a key of ``POS_DATA``.
    """
    entry = base_data.model_copy(deep=True)
    # Append before the POS_DATA lookup, as callers may rely on the entry
    # being present in ``page_data`` from this point on.
    page_data.append(entry)
    entry.pos_title = pos_title
    pos_data = POS_DATA[pos_title]
    entry.pos = pos_data["pos"]
    entry.tags.extend(pos_data.get("tags", []))

    children = level_node.children
    # Defaults to "all children" when the section has no gloss list.
    gloss_list_index = len(children)
    for index, node in enumerate(children):
        if isinstance(node, WikiNode) and node.kind == NodeKind.LIST:
            # The list marker does not change between items, so test it once
            # per list rather than once per list item.
            if not (node.sarg.startswith("#") and node.sarg.endswith("#")):
                continue
            for list_item in node.find_child(NodeKind.LIST_ITEM):
                extract_gloss_list_item(wxr, entry, list_item)
                # Only a gloss list with at least one item moves the
                # header boundary.
                if index < gloss_list_index:
                    gloss_list_index = index
        elif (
            isinstance(node, TemplateNode)
            and node.template_name in _LIHAT_TEMPLATE_NAMES
        ):
            extract_lihat_2_template(wxr, entry, node)

    process_pos_header_nodes(wxr, entry, children[:gloss_list_index])
def extract_gloss_list_item(
    wxr: WiktextractContext,
    word_entry: WordEntry,
    list_item: WikiNode,
    parent_sense: Sense | None = None,
) -> None:
    """Build a sense from one gloss list item and recurse into its sub-lists.

    The sense starts as a deep copy of ``parent_sense`` when one is given,
    otherwise as an empty ``Sense``.  It is appended to
    ``word_entry.senses`` only if it ends up with at least one gloss.
    Nested ``#`` lists are processed recursively with this sense as parent;
    nested lists whose marker starts with ``#`` and ends with ``:`` or ``*``
    are passed to ``extract_example_list_item``.
    """
    if parent_sense is None:
        sense = Sense()
    else:
        sense = parent_sense.model_copy(deep=True)

    gloss_parts = []
    seen_br = False
    for child in list_item.children:
        if isinstance(child, TemplateNode):
            name = child.template_name
            if name.startswith("variasi"):
                extract_variasi_template(wxr, sense, child)
            elif name == "defdate":
                extract_defdate_template(wxr, sense, child)
            else:
                text = clean_node(wxr, sense, child)
                looks_like_label = text.startswith("(") and (
                    text.strip().endswith((")", ") ·"))
                )
                if looks_like_label:
                    # Parenthesised expansion: every "·"-separated piece
                    # becomes a raw tag.
                    for piece in text.split("·"):
                        piece = piece.strip("() ")
                        if piece != "":
                            sense.raw_tags.append(piece)
                else:
                    # Any other template contributes its expanded text to
                    # the gloss.
                    gloss_parts.append(text)
            continue

        # Only the first <br> flips the flag; later ones fall through and
        # are kept as gloss nodes.
        if isinstance(child, HTMLNode) and child.tag == "br" and not seen_br:
            seen_br = True
            continue

        is_wiki_node = isinstance(child, WikiNode)
        if is_wiki_node and child.kind == NodeKind.ITALIC and seen_br:
            # Italic text after the <br> is recorded as an example.
            example_text = clean_node(wxr, None, child)
            if example_text != "":
                sense.examples.append(Example(text=example_text))
            continue

        if is_wiki_node and child.kind == NodeKind.LIST:
            # Nested lists are handled after the gloss has been built.
            continue

        gloss_parts.append(child)

    gloss_text = clean_node(wxr, sense, gloss_parts)
    if gloss_text != "":
        sense.glosses.append(gloss_text)
        if gloss_text.startswith("bentuk "):
            find_form_of_link(wxr, sense, gloss_parts)

    # The "variasi" template handler may also have added a gloss, so test
    # the list rather than ``gloss_text``.
    if sense.glosses:
        translate_raw_tags(sense)
        word_entry.senses.append(sense)

    for sub_list in list_item.find_child(NodeKind.LIST):
        marker = sub_list.sarg
        if not marker.startswith("#"):
            continue
        if marker.endswith("#"):
            for sub_item in sub_list.find_child(NodeKind.LIST_ITEM):
                extract_gloss_list_item(wxr, word_entry, sub_item, sense)
        elif marker.endswith((":", "*")):
            for sub_item in sub_list.find_child(NodeKind.LIST_ITEM):
                extract_example_list_item(wxr, word_entry, sense, sub_item)
def extract_lihat_2_template(
    wxr: WiktextractContext, word_entry: WordEntry, t_node: TemplateNode
) -> None:
    """Add one sense per list item produced by a "lihat ..." template.

    The template is expanded and re-parsed.  For every list item of the
    expansion, the text after the first ``⇢`` becomes the gloss and the
    text before the first ``)`` of the part preceding the arrow becomes a
    raw tag.  Items without an arrow, or with nothing after it, add no
    sense.

    NOTE(review): the ``(label) ⇢ gloss`` item layout is inferred from the
    slicing below, not from the template's documentation; confirm against
    the template page.
    """
    # https://id.wiktionary.org/wiki/Templat:lihat_2
    expanded_template = wxr.wtp.parse(
        wxr.wtp.node_to_wikitext(t_node), expand_all=True
    )
    for list_node in expanded_template.find_child(NodeKind.LIST):
        for list_item in list_node.find_child(NodeKind.LIST_ITEM):
            sense = Sense()
            item_text = clean_node(wxr, sense, list_item.children)
            label_part, arrow, gloss = item_text.partition("⇢")
            gloss = gloss.strip()
            if arrow == "" or gloss == "":
                # No arrow or an empty gloss: nothing worth recording.
                continue
            sense.glosses.append(gloss)
            # Search for the label only before the arrow, so a ")" inside
            # the gloss cannot pull the arrow and gloss into the raw tag.
            if ")" in label_part:
                raw_tag = label_part[: label_part.index(")")].strip("( ")
                if raw_tag != "":
                    sense.raw_tags.append(raw_tag)
            word_entry.senses.append(sense)
def process_pos_header_nodes(
    wxr: WiktextractContext, word_entry: WordEntry, nodes: list[WikiNode | str]
) -> None:
    """Collect forms from the nodes that precede the gloss list.

    Links are considered only once a bold node has been seen.  The cleaned
    first link argument of each such link becomes a ``Form``.  The most
    recent string node ending with ``:`` (stripped of ``():;,`` and spaces)
    is attached to the form as a raw tag.
    """
    pending_label = ""
    seen_bold = False
    for item in nodes:
        if not isinstance(item, WikiNode):
            if isinstance(item, str) and item.strip().endswith(":"):
                pending_label = item.strip("():;, ")
            continue

        if item.kind == NodeKind.BOLD:
            seen_bold = True
            continue
        if item.kind != NodeKind.LINK or not seen_bold:
            continue
        # Skip links that render to nothing or carry no arguments.
        if clean_node(wxr, None, item) == "" or len(item.largs) == 0:
            continue

        target = clean_node(wxr, None, item.largs[0])
        if target == "":
            continue
        form = Form(form=target)
        if pending_label != "":
            form.raw_tags.append(pending_label)
            translate_raw_tags(form)
        word_entry.forms.append(form)
def extract_variasi_template(
    wxr: WiktextractContext, sense: Sense, t_node: TemplateNode
) -> None:
    """Record a "variasi..." template on ``sense`` as an alt-of relation.

    Each non-empty positional parameter 1 to 3 is added to
    ``sense.alt_of``.  The template's cleaned text, when non-empty, is
    added as a gloss.  The ``alt-of`` tag is always appended.
    """
    for position in (1, 2, 3):
        argument = t_node.template_parameters.get(position, "")
        target = clean_node(wxr, None, argument)
        if target != "":
            sense.alt_of.append(AltForm(word=target))

    expanded_text = clean_node(wxr, sense, t_node)
    if expanded_text != "":
        sense.glosses.append(expanded_text)
    sense.tags.append("alt-of")
def find_form_of_link(
    wxr: WiktextractContext, sense: Sense, gloss_nodes: list[WikiNode | str]
) -> None:
    """Mark ``sense`` as a form-of using the last link in the gloss nodes.

    Top-level link nodes and links that are direct children of italic nodes
    are considered; a later link overrides an earlier one.  If the resulting
    text is non-empty it is added to ``sense.form_of`` together with the
    ``form-of`` tag.
    """
    # pre-expanded "nomina *", "imbuhan *", "ulang *", "verba *" templates
    target = ""
    for node in gloss_nodes:
        if not isinstance(node, WikiNode):
            continue
        if node.kind == NodeKind.LINK:
            candidates = [node]
        elif node.kind == NodeKind.ITALIC:
            candidates = node.find_child(NodeKind.LINK)
        else:
            continue
        for link in candidates:
            target = clean_node(wxr, None, link)

    if target == "":
        return
    sense.form_of.append(AltForm(word=target))
    sense.tags.append("form-of")
def extract_usage_section(
    wxr: WiktextractContext, word_entry: WordEntry, section_node: LevelNode
) -> None:
    """Collect usage notes from a section into ``word_entry.notes``.

    Every list item (at any nesting depth) yields one note built from its
    non-list children.  All other children seen before the first
    sub-section node are cleaned together and appended as a single note
    after the list notes.
    """
    loose_nodes = []
    for child in section_node.children:
        is_wiki_node = isinstance(child, WikiNode)
        if is_wiki_node and child.kind == NodeKind.LIST:
            for item in child.find_child_recursively(NodeKind.LIST_ITEM):
                # Leave nested lists out; their items are visited
                # separately by the recursive search.
                item_nodes = list(item.invert_find_child(NodeKind.LIST))
                item_text = clean_node(wxr, word_entry, item_nodes)
                if item_text != "":
                    word_entry.notes.append(item_text)
            continue
        if is_wiki_node and child.kind in LEVEL_KIND_FLAGS:
            # Stop at the first nested section heading.
            break
        loose_nodes.append(child)

    loose_text = clean_node(wxr, word_entry, loose_nodes)
    if loose_text != "":
        word_entry.notes.append(loose_text)
def extract_defdate_template(
    wxr: WiktextractContext, sense: Sense, t_node: TemplateNode
) -> None:
    """Record the text of a "defdate" template as an attestation date.

    The template is expanded and re-parsed, its cleaned text is stripped of
    leading and trailing square brackets, and a non-empty result is appended
    to ``sense.attestations``.
    """
    expanded_node = wxr.wtp.parse(
        wxr.wtp.node_to_wikitext(t_node), expand_all=True
    )
    date = clean_node(wxr, None, expanded_node).strip("[]")
    if date != "":
        sense.attestations.append(Attestation(date=date))