Coverage for src/wiktextract/extractor/ms/pos.py: 95%
110 statements
« prev ^ index » next coverage.py v7.10.3, created at 2025-08-15 05:18 +0000
« prev ^ index » next coverage.py v7.10.3, created at 2025-08-15 05:18 +0000
1from wikitextprocessor import LevelNode, NodeKind, TemplateNode, WikiNode
3from ...page import clean_node
4from ...wxr_context import WiktextractContext
5from .example import extract_example_list_item
6from .models import AltForm, Attestation, Form, Sense, WordEntry
7from .section_titles import POS_DATA
8from .tags import translate_raw_tags
# Template-name suffixes that mark a template as a POS header template.
# This must stay a tuple: it is handed directly to ``str.endswith``.
POS_HEADER_TEMPLATE_SUFFIXES = (
    "-ks", "-adj",
    "-kn", "-noun",
    "-kk", "-verb", "-kerja",
    "-kgn", "-pron",
    "-kkt", "-adv",
    "-kp",
    "-sendi", "-prep",
    "-seru",
    "-kanji", "-hanzi", "-hanja",
    "-conj",
    "-hantu",
)

# Gloss templates handled by ``extract_form_of_template`` in addition to any
# template whose name ends with " of".
FORM_OF_TEMPLATES = {
    "ja-perumian",
    "jamak",
    "alt case",
}
# Templates whose mentioned word is stored in ``Sense.alt_of`` rather than
# ``Sense.form_of``.  "alt case" is listed in both sets; the alt-of check is
# made first, so it ends up in ``alt_of``.
ALT_OF_TEMPLATES = {
    "alt case",
    "alternative case form of",
}
def extract_pos_section(
    wxr: WiktextractContext,
    page_data: list[WordEntry],
    base_data: WordEntry,
    level_node: LevelNode,
    pos_title: str,
) -> None:
    """Extract one part-of-speech section into ``page_data``.

    A deep copy of ``base_data`` is appended to ``page_data`` and given the
    POS and tags that ``POS_DATA`` associates with the lower-cased
    ``pos_title``.  The direct children of ``level_node`` are then scanned:

    * lists whose marker consists of ``#`` at both ends are gloss lists and
      every item is passed to ``extract_gloss_list_item``;
    * POS header templates (recognised by name suffix or by the names
      ``inti``, ``head`` and ``Han char``) are passed to
      ``extract_pos_header_template``, which may append a further entry.

    If the entry at the end of ``page_data`` has no senses afterwards, it is
    removed again.

    Args:
        wxr: Extraction context.
        page_data: Entries collected for the page so far; mutated in place.
        base_data: Template entry that new entries are copied from.
        level_node: Section node whose children are scanned.
        pos_title: Section title; its lower-cased form must be a key of
            ``POS_DATA`` (the lookup is a plain subscript, so an unknown
            title raises).
    """
    page_data.append(base_data.model_copy(deep=True))
    page_data[-1].pos_title = pos_title
    pos_data = POS_DATA[pos_title.lower()]
    page_data[-1].pos = pos_data["pos"]
    page_data[-1].tags.extend(pos_data.get("tags", []))

    for node in level_node.children:
        if isinstance(node, WikiNode) and node.kind == NodeKind.LIST:
            # The marker belongs to the list, not to its items, so it is
            # checked once instead of once per item.
            if node.sarg.startswith("#") and node.sarg.endswith("#"):
                for list_item in node.find_child(NodeKind.LIST_ITEM):
                    # ``page_data[-1]`` is re-read for every item because a
                    # header template seen earlier may have appended an entry.
                    extract_gloss_list_item(wxr, page_data[-1], list_item)
        elif isinstance(node, TemplateNode) and (
            node.template_name.endswith(POS_HEADER_TEMPLATE_SUFFIXES)
            or node.template_name in ["inti", "head", "Han char"]
        ):
            extract_pos_header_template(wxr, page_data, base_data, node)

    # Only the last entry is inspected here.
    if len(page_data[-1].senses) == 0:
        page_data.pop()
def extract_gloss_list_item(
    wxr: WiktextractContext,
    word_entry: WordEntry,
    list_item: WikiNode,
    parent_sense: Sense | None = None,
) -> None:
    """Turn one gloss list item into a sense and descend into its sub-lists.

    The sense starts as a deep copy of ``parent_sense`` when one is given,
    otherwise as an empty ``Sense``.  Children of ``list_item`` are handled
    as follows:

    * label/context templates and ``defdate`` only contribute metadata and
      are left out of the gloss text;
    * form-of / alt-of templates contribute metadata *and* stay in the gloss;
    * nested lists are left out of the gloss (they are handled afterwards);
    * everything else is part of the gloss.

    The sense is added to ``word_entry.senses`` only if it has at least one
    gloss.  Sub-lists are processed either way: ``#``-terminated lists are
    nested glosses (recursing with this sense as parent), and ``#:`` / ``#*``
    style lists are examples for this sense.

    Args:
        wxr: Extraction context.
        word_entry: Entry that receives the extracted senses.
        list_item: The list item node to process.
        parent_sense: Sense of the enclosing gloss item, if any.
    """
    label_templates = (
        "label",
        "lb",
        "konteks",
        "context",
        "konteks 1",
        "context 2",
    )

    if parent_sense is None:
        sense = Sense()
    else:
        sense = parent_sense.model_copy(deep=True)

    gloss_parts = []
    for child in list_item.children:
        if isinstance(child, TemplateNode):
            name = child.template_name
            if name in label_templates:
                extract_label_template(wxr, sense, child)
                continue
            if name == "defdate":
                extract_defdate_template(wxr, sense, child)
                continue
            if (
                name.endswith(" of")
                or name in FORM_OF_TEMPLATES
                or name in ALT_OF_TEMPLATES
            ):
                extract_form_of_template(wxr, sense, child)
                gloss_parts.append(child)
                continue
        # Nested lists are not gloss text; they are walked below.
        if isinstance(child, WikiNode) and child.kind == NodeKind.LIST:
            continue
        gloss_parts.append(child)

    gloss_text = clean_node(wxr, sense, gloss_parts)
    if gloss_text != "":
        sense.glosses.append(gloss_text)
    if sense.glosses:
        translate_raw_tags(sense)
        word_entry.senses.append(sense)

    for sub_list in list_item.find_child(NodeKind.LIST):
        marker = sub_list.sarg
        if not marker.startswith("#"):
            continue
        if marker.endswith("#"):
            for sub_item in sub_list.find_child(NodeKind.LIST_ITEM):
                extract_gloss_list_item(wxr, word_entry, sub_item, sense)
        elif marker.endswith((":", "*")):
            for example_item in sub_list.find_child(NodeKind.LIST_ITEM):
                extract_example_list_item(
                    wxr, word_entry, sense, example_item
                )
def extract_pos_header_template(
    wxr: WiktextractContext,
    page_data: list[WordEntry],
    base_data: WordEntry,
    t_node: TemplateNode,
) -> None:
    """Read POS, categories and inflected forms from a POS header template.

    The template is expanded and its top-level link nodes are run through
    ``clean_node`` with a scratch dict; the ``"categories"`` list found in
    that dict afterwards is matched against the keys of ``POS_DATA`` (by
    lower-cased prefix) to determine a POS and its tags.

    If the current last entry has the title ``"Takrifan"`` and already has a
    known POS, a fresh copy of ``base_data`` is appended for this header.
    An entry whose POS is still ``"unknown"`` receives the detected POS and
    tags.  The categories are added to the last entry in every case.

    Finally, every ``<b>`` element in the expansion becomes a ``Form`` on the
    last entry, tagged with the text of the most recent preceding ``<i>``
    element, if any.

    Args:
        wxr: Extraction context.
        page_data: Entries collected so far; the last one is updated and a
            new one may be appended.
        base_data: Template entry that a new entry is copied from.
        t_node: The header template node.
    """
    cats = {}
    expanded = wxr.wtp.parse(
        wxr.wtp.node_to_wikitext(t_node), expand_all=True
    )
    for link in expanded.find_child(NodeKind.LINK):
        # Called for its effect on ``cats``; the returned text is not needed.
        clean_node(wxr, cats, link)
    categories = cats.get("categories", [])

    pos_type = "unknown"
    pos_tags = []
    for category in categories:
        lowered = category.lower()
        for title, data in POS_DATA.items():
            if lowered.startswith(title):
                pos_type = data["pos"]
                pos_tags = data.get("tags", [])
                break
        # Stop at the first category that yields a real POS.
        if pos_type != "unknown":
            break

    entry = page_data[-1]
    if entry.pos_title == "Takrifan" and entry.pos != "unknown":
        entry = base_data.model_copy(deep=True)
        page_data.append(entry)
        entry.pos = pos_type
        entry.pos_title = "Takrifan"
        entry.tags.extend(pos_tags)
    if entry.pos == "unknown":
        entry.pos = pos_type
        entry.tags.extend(pos_tags)
    entry.categories.extend(categories)

    pending_tag = ""
    for html_node in expanded.find_child_recursively(NodeKind.HTML):
        if html_node.tag == "i":
            # Remember the label; it applies to the bold forms that follow.
            pending_tag = clean_node(wxr, None, html_node)
        elif html_node.tag == "b":
            form = Form(form=clean_node(wxr, None, html_node))
            if pending_tag != "":
                form.raw_tags.append(pending_tag)
            if form.form != "":
                translate_raw_tags(form)
                entry.forms.append(form)
def extract_label_template(
    wxr: WiktextractContext, sense: Sense, t_node: TemplateNode
) -> None:
    """Add the labels rendered by a label/context template to ``sense``.

    The cleaned template text has parentheses and spaces stripped from both
    ends and is then split on commas; every non-empty, whitespace-trimmed
    piece is appended to ``sense.raw_tags``.

    Args:
        wxr: Extraction context.
        sense: Sense that receives the raw tags (also passed to
            ``clean_node``).
        t_node: The label template node.
    """
    label_text = clean_node(wxr, sense, t_node).strip("() ")
    sense.raw_tags.extend(
        [
            label
            for piece in label_text.split(",")
            if (label := piece.strip()) != ""
        ]
    )
def extract_form_of_template(
    wxr: WiktextractContext, sense: Sense, t_node: TemplateNode
) -> None:
    """Record the words a form-of / alt-of template points at.

    The template is expanded and every ``<i>`` element whose ``class``
    attribute contains ``"mention"`` is cleaned to text.  Each non-empty
    result is stored as an ``AltForm`` in ``sense.alt_of`` when the template
    name is in ``ALT_OF_TEMPLATES``, and in ``sense.form_of`` otherwise.

    Args:
        wxr: Extraction context.
        sense: Sense that receives the ``AltForm`` items.
        t_node: The form-of / alt-of template node.
    """
    expanded = wxr.wtp.parse(
        wxr.wtp.node_to_wikitext(t_node), expand_all=True
    )
    # The destination depends only on the template name, so pick it once.
    if t_node.template_name in ALT_OF_TEMPLATES:
        target = sense.alt_of
    else:
        target = sense.form_of

    for element in expanded.find_child_recursively(NodeKind.HTML):
        if element.tag != "i":
            continue
        if "mention" not in element.attrs.get("class", ""):
            continue
        word = clean_node(wxr, None, element)
        if word != "":
            target.append(AltForm(word=word))
def extract_defdate_template(
    wxr: WiktextractContext, sense: Sense, t_node: TemplateNode
) -> None:
    """Store the date shown by a ``defdate`` template as an attestation.

    The template is expanded and cleaned to plain text, and any ``[`` / ``]``
    characters are stripped from both ends.  A non-empty result is appended
    to ``sense.attestations`` as an ``Attestation``.

    Args:
        wxr: Extraction context.
        sense: Sense that receives the attestation.
        t_node: The ``defdate`` template node.
    """
    expanded_node = wxr.wtp.parse(
        wxr.wtp.node_to_wikitext(t_node), expand_all=True
    )
    date = clean_node(wxr, None, expanded_node).strip("[]")
    if date != "":
        sense.attestations.append(Attestation(date=date))