Coverage for src/wiktextract/extractor/ms/pos.py: 95%
103 statements
« prev ^ index » next coverage.py v7.9.2, created at 2025-07-04 10:58 +0000
« prev ^ index » next coverage.py v7.9.2, created at 2025-07-04 10:58 +0000
1from wikitextprocessor import LevelNode, NodeKind, TemplateNode, WikiNode
3from ...page import clean_node
4from ...wxr_context import WiktextractContext
5from .example import extract_example_list_item
6from .models import AltForm, Form, Sense, WordEntry
7from .section_titles import POS_DATA
8from .tags import translate_raw_tags
# Name suffixes that identify a part-of-speech headword template.
# Kept as a tuple because it is passed directly to ``str.endswith``, which
# accepts a tuple of suffixes but not a list or set.
POS_HEADER_TEMPLATE_SUFFIXES = (
    "-ks",
    "-adj",
    "-kn",
    "-noun",
    "-kk",
    "-verb",
    "-kerja",
    "-kgn",
    "-pron",
    "-kkt",
    "-adv",
    "-kp",
    "-sendi",
    "-prep",
    "-seru",
    "-kanji",
    "-hanzi",
    "-hanja",
    "-conj",
    "-hantu",
)

# Gloss templates (besides any template whose name ends in " of") that are
# handed to extract_form_of_template().
FORM_OF_TEMPLATES = {"ja-perumian", "jamak", "alt case"}
# Templates whose mentioned word is stored in ``Sense.alt_of`` instead of
# ``Sense.form_of``.  "alt case" appears in both sets;
# extract_form_of_template() tests this set first, so it lands in ``alt_of``.
ALT_OF_TEMPLATES = {"alt case", "alternative case form of"}
def extract_pos_section(
    wxr: WiktextractContext,
    page_data: list[WordEntry],
    base_data: WordEntry,
    level_node: LevelNode,
    pos_title: str,
) -> None:
    """Start a word entry for a part-of-speech section and fill it in.

    A deep copy of ``base_data`` is appended to ``page_data`` and given the
    POS and tags that ``POS_DATA`` holds for the lower-cased ``pos_title``.
    The direct children of ``level_node`` are then scanned: items of lists
    whose marker starts and ends with "#" are parsed as glosses, and
    headword templates are passed to ``extract_pos_header_template``.  If the
    last entry in ``page_data`` has no senses afterwards, it is removed.

    Raises:
        KeyError: if ``pos_title.lower()`` is not a key of ``POS_DATA``.
    """
    page_data.append(base_data.model_copy(deep=True))
    page_data[-1].pos_title = pos_title
    pos_data = POS_DATA[pos_title.lower()]
    page_data[-1].pos = pos_data["pos"]
    page_data[-1].tags.extend(pos_data.get("tags", []))

    for node in level_node.children:
        if isinstance(node, WikiNode) and node.kind == NodeKind.LIST:
            # The marker belongs to the list, not to its items, so it is
            # checked once per list.
            if node.sarg.startswith("#") and node.sarg.endswith("#"):
                for list_item in node.find_child(NodeKind.LIST_ITEM):
                    # page_data[-1] is looked up on every call because the
                    # header-template handler may append a new entry.
                    extract_gloss_list_item(wxr, page_data[-1], list_item)
        elif isinstance(node, TemplateNode) and (
            node.template_name.endswith(POS_HEADER_TEMPLATE_SUFFIXES)
            or node.template_name in ["inti", "head", "Han char"]
        ):
            extract_pos_header_template(wxr, page_data, base_data, node)

    if len(page_data[-1].senses) == 0:
        page_data.pop()
def extract_gloss_list_item(
    wxr: WiktextractContext,
    word_entry: WordEntry,
    list_item: WikiNode,
    parent_sense: Sense | None = None,
) -> None:
    """Turn one gloss list item (and its nested lists) into senses.

    The sense starts as a deep copy of ``parent_sense`` when one is given,
    otherwise as an empty ``Sense``.  Label templates add raw tags, form-of
    templates add ``form_of``/``alt_of`` data, and every child except nested
    lists contributes to the gloss text.  The sense is appended to
    ``word_entry.senses`` if it has at least one gloss.

    Nested lists are processed afterwards: lists whose marker starts and
    ends with "#" recurse with this sense as the parent, and lists whose
    marker starts with "#" and ends with ":" or "*" are passed to
    ``extract_example_list_item``.
    """
    label_templates = (
        "label",
        "lb",
        "konteks",
        "context",
        "konteks 1",
        "context 2",
    )
    sense = (
        parent_sense.model_copy(deep=True)
        if parent_sense is not None
        else Sense()
    )
    gloss_nodes = []
    for node in list_item.children:
        if isinstance(node, TemplateNode):
            if node.template_name in label_templates:
                # Labels only supply tags; they are not part of the gloss.
                extract_label_template(wxr, sense, node)
                continue
            if (
                node.template_name.endswith(" of")
                or node.template_name in FORM_OF_TEMPLATES
                or node.template_name in ALT_OF_TEMPLATES
            ):
                # No ``continue``: the template also stays in the gloss
                # text via the append below.
                extract_form_of_template(wxr, sense, node)
        if not (isinstance(node, WikiNode) and node.kind == NodeKind.LIST):
            gloss_nodes.append(node)

    gloss_str = clean_node(wxr, sense, gloss_nodes)
    if gloss_str != "":
        sense.glosses.append(gloss_str)
    # A child sense inherits its parent's glosses through the deep copy, so
    # this can be true even when the item's own gloss text is empty.
    if len(sense.glosses) > 0:
        translate_raw_tags(sense)
        word_entry.senses.append(sense)

    for child_list in list_item.find_child(NodeKind.LIST):
        marker = child_list.sarg
        if marker.startswith("#") and marker.endswith("#"):
            for child_list_item in child_list.find_child(NodeKind.LIST_ITEM):
                extract_gloss_list_item(wxr, word_entry, child_list_item, sense)
        elif marker.startswith("#") and marker.endswith((":", "*")):
            for e_list_item in child_list.find_child(NodeKind.LIST_ITEM):
                extract_example_list_item(wxr, word_entry, sense, e_list_item)
117def extract_pos_header_template(
118 wxr: WiktextractContext,
119 page_data: list[WordEntry],
120 base_data: WordEntry,
121 t_node: TemplateNode,
122) -> None:
123 cats = {}
124 expanded_template = wxr.wtp.parse(
125 wxr.wtp.node_to_wikitext(t_node), expand_all=True
126 )
127 for link_node in expanded_template.find_child(NodeKind.LINK):
128 clean_node(wxr, cats, link_node)
129 pos_type = "unknown"
130 pos_tags = []
131 for cat in cats.get("categories", []): 131 ↛ 139line 131 didn't jump to line 139 because the loop on line 131 didn't complete
132 for pos_title, pos_data in POS_DATA.items():
133 if cat.lower().startswith(pos_title):
134 pos_type = pos_data["pos"]
135 pos_tags = pos_data.get("tags", [])
136 break
137 if pos_type != "unknown":
138 break
139 if page_data[-1].pos_title == "Takrifan" and page_data[-1].pos != "unknown":
140 page_data.append(base_data.model_copy(deep=True))
141 page_data[-1].pos = pos_type
142 page_data[-1].pos_title = "Takrifan"
143 page_data[-1].tags.extend(pos_tags)
144 if page_data[-1].pos == "unknown":
145 page_data[-1].pos = pos_type
146 page_data[-1].tags.extend(pos_tags)
147 page_data[-1].categories.extend(cats.get("categories", []))
149 raw_tag = ""
150 for node in expanded_template.find_child_recursively(NodeKind.HTML):
151 match node.tag:
152 case "i":
153 raw_tag = clean_node(wxr, None, node)
154 case "b":
155 form = Form(form=clean_node(wxr, None, node))
156 if raw_tag != "": 156 ↛ 158line 156 didn't jump to line 158 because the condition on line 156 was always true
157 form.raw_tags.append(raw_tag)
158 if form.form != "": 158 ↛ 150line 158 didn't jump to line 150 because the condition on line 158 was always true
159 translate_raw_tags(form)
160 page_data[-1].forms.append(form)
def extract_label_template(
    wxr: WiktextractContext, sense: Sense, t_node: TemplateNode
) -> None:
    """Add the labels rendered by a label/context template to ``sense``.

    The cleaned template text has surrounding parentheses and spaces
    removed, is split on commas, and every non-empty trimmed piece is
    appended to ``sense.raw_tags``.
    """
    text = clean_node(wxr, sense, t_node).strip("() ")
    sense.raw_tags.extend(
        label for label in map(str.strip, text.split(",")) if label != ""
    )
def extract_form_of_template(
    wxr: WiktextractContext, sense: Sense, t_node: TemplateNode
) -> None:
    """Record the words mentioned by a form-of / alt-of template.

    The template is expanded and every ``<i>`` element whose ``class``
    attribute contains "mention" is cleaned to text.  Each non-empty word is
    appended as an ``AltForm`` to ``sense.alt_of`` when the template name is
    in ``ALT_OF_TEMPLATES``, otherwise to ``sense.form_of``.
    """
    expanded_template = wxr.wtp.parse(
        wxr.wtp.node_to_wikitext(t_node), expand_all=True
    )
    # The template name does not change while walking its expansion.
    is_alt_of = t_node.template_name in ALT_OF_TEMPLATES
    for html_tag in expanded_template.find_child_recursively(NodeKind.HTML):
        if html_tag.tag != "i" or "mention" not in html_tag.attrs.get(
            "class", ""
        ):
            continue
        word = clean_node(wxr, None, html_tag)
        if word == "":
            continue
        if is_alt_of:
            sense.alt_of.append(AltForm(word=word))
        else:
            sense.form_of.append(AltForm(word=word))