Coverage for src/wiktextract/extractor/th/linkage.py: 62%
181 statements
« prev ^ index » next — coverage.py v7.10.7, created at 2025-10-13 10:14 +0000
1from itertools import count
3from wikitextprocessor.parser import (
4 LEVEL_KIND_FLAGS,
5 LevelNode,
6 NodeKind,
7 TemplateNode,
8 WikiNode,
9)
11from ...page import clean_node
12from ...wxr_context import WiktextractContext
13from .models import Linkage, WordEntry
14from .section_titles import LINKAGE_SECTIONS
15from .tags import translate_raw_tags
def extract_linkage_section(
    wxr: WiktextractContext,
    word_entry: WordEntry,
    level_node: LevelNode,
    linkage_type: str,
    source: str = "",
    sense: str = "",
) -> None:
    """Dispatch each child of a linkage section to the matching handler.

    Recognizes ``col*`` column templates, the ``ws`` (Wikisaurus)
    template, the ``zh-dial`` dialect table template, and plain wiki
    lists; anything else is ignored.
    """
    for child in level_node.children:
        if isinstance(child, TemplateNode):
            t_name = child.template_name
            if t_name.startswith("col"):
                extract_col_template(
                    wxr, word_entry, child, linkage_type, source, sense
                )
            elif t_name == "ws":
                extract_ws_template(
                    wxr, word_entry, child, linkage_type, source, sense
                )
            elif t_name == "zh-dial":
                extract_zh_dial_template(
                    wxr, word_entry, child, linkage_type, sense
                )
        elif isinstance(child, WikiNode) and child.kind == NodeKind.LIST:
            for list_item in child.find_child(NodeKind.LIST_ITEM):
                extract_linkage_list_item(
                    wxr, word_entry, list_item, linkage_type, source, sense
                )
def extract_col_template(
    wxr: WiktextractContext,
    word_entry: WordEntry,
    t_node: TemplateNode,
    linkage_type: str,
    source: str,
    sense: str,
) -> None:
    """Extract linkage words from an expanded ``col*`` column template.

    Each ``li`` element may contain several word spans (e.g. traditional
    and simplified Chinese forms) plus a ``Latn`` romanization span that
    applies to all words gathered so far in that item.
    """
    expanded = wxr.wtp.parse(
        wxr.wtp.node_to_wikitext(t_node), expand_all=True
    )
    for li_tag in expanded.find_html_recursively("li"):
        items = []
        for span_tag in li_tag.find_html("span"):
            css_class = span_tag.attrs.get("class", "")
            if "Latn" in css_class:
                # romanization span: attach to every word collected so far
                romanization = clean_node(wxr, None, span_tag)
                for item in items:
                    item.roman = romanization
            elif "lang" in span_tag.attrs:
                word = clean_node(wxr, None, span_tag)
                if word == "":
                    continue
                new_item = Linkage(word=word, source=source, sense=sense)
                if css_class == "Hant":
                    new_item.tags.append("Traditional-Chinese")
                elif css_class == "Hans":
                    new_item.tags.append("Simplified-Chinese")
                items.append(new_item)
        getattr(word_entry, linkage_type).extend(items)
def extract_linkage_list_item(
    wxr: WiktextractContext,
    word_entry: WordEntry,
    list_item: WikiNode,
    linkage_type: str,
    source: str,
    sense: str,
) -> None:
    """Parse one wiki list item of a linkage section.

    Handles ``l``/``zh-l`` link templates, plain wiki links, qualifier
    templates (collected as raw tags attached to link data), italicized
    thesaurus-page links, and a trailing "- gloss" text segment that
    retroactively sets the sense of every linkage found in this item.
    """
    linkages = []
    raw_tags = []

    for index, node in enumerate(list_item.children):
        if isinstance(node, TemplateNode) and node.template_name == "l":
            # {{l|lang|word}} link template; the word is the 2nd parameter
            l_data = Linkage(
                word=clean_node(wxr, None, node.template_parameters.get(2, "")),
                source=source,
                sense=sense,
                raw_tags=raw_tags,
            )
            if l_data.word != "":
                translate_raw_tags(l_data)
                linkages.append(l_data)
        elif isinstance(node, WikiNode) and node.kind == NodeKind.ITALIC:
            # italicized link to a thesaurus ("อรรถาภิธาน:") page; follow it,
            # but not while already processing a thesaurus page (the source
            # check prevents recursing back into thesaurus extraction)
            for link_node in node.find_child(NodeKind.LINK):
                link_str = clean_node(wxr, None, link_node)
                if link_str.startswith("อรรถาภิธาน:") and not source.startswith(
                    "อรรถาภิธาน:"
                ):
                    extract_thesaurus_page(
                        wxr, word_entry, linkage_type, link_str, sense
                    )
        elif isinstance(node, WikiNode) and node.kind == NodeKind.LINK:
            # plain [[word]] wiki link
            link_str = clean_node(wxr, None, node)
            if link_str != "":
                l_data = Linkage(word=link_str, sense=sense, raw_tags=raw_tags)
                translate_raw_tags(l_data)
                linkages.append(l_data)
        elif isinstance(node, str) and ("-" in node or "–" in node):
            # "word - gloss" separator: everything after the dash (plus the
            # rest of the list item) is the sense; apply it to all linkages
            # collected so far, then stop processing this item
            if "-" in node:
                sense = node[node.index("-") + 1 :]
            elif "–" in node:
                sense = node[node.index("–") + 1 :]
            sense = clean_node(
                wxr,
                None,
                [sense] + list_item.children[index + 1 :],
            ).strip()
            for l_data in linkages:
                l_data.sense = sense
            break
        elif isinstance(node, TemplateNode) and node.template_name in [
            "qualifier",
            "q",
            "qual",
            "qf",
        ]:
            # qualifier template, e.g. "(formal)"; split the comma-separated
            # text into raw tags shared by the linkage words of this item
            text = clean_node(wxr, None, node).strip("() ")
            for raw_tag in text.split(","):
                raw_tag = raw_tag.strip()
                if raw_tag != "":
                    raw_tags.append(raw_tag)
        elif isinstance(node, TemplateNode) and node.template_name == "zh-l":
            linkages.extend(extract_zh_l_template(wxr, node, sense, raw_tags))

    getattr(word_entry, linkage_type).extend(linkages)
def extract_thesaurus_page(
    wxr: WiktextractContext,
    word_entry: WordEntry,
    linkage_type: str,
    page_title: str,
    sense: str,
) -> None:
    """Follow a thesaurus-page link and harvest matching linkage sections.

    Only sections under the matching language (level 2) and part-of-speech
    (level 3) headings whose title maps to ``linkage_type`` are extracted.
    """
    page = wxr.wtp.get_page(page_title, 110)  # 110: thesaurus namespace
    if page is None or page.body is None:
        return
    tree = wxr.wtp.parse(page.body)
    for lang_level in tree.find_child(NodeKind.LEVEL2):
        # level-2 headings are "ภาษา<name>"; strip the prefix to compare
        lang_name = clean_node(wxr, None, lang_level.largs).removeprefix(
            "ภาษา"
        )
        if lang_name != word_entry.lang:
            continue
        for pos_level in lang_level.find_child(NodeKind.LEVEL3):
            if clean_node(wxr, None, pos_level.largs) != word_entry.pos_title:
                continue
            for sub_level in pos_level.find_child_recursively(
                LEVEL_KIND_FLAGS
            ):
                title = clean_node(wxr, None, sub_level.largs)
                if LINKAGE_SECTIONS.get(title) == linkage_type:
                    extract_linkage_section(
                        wxr,
                        word_entry,
                        sub_level,
                        linkage_type,
                        source=page_title,
                        sense=sense,
                    )
def extract_ws_template(
    wxr: WiktextractContext,
    word_entry: WordEntry,
    t_node: TemplateNode,
    linkage_type: str,
    source: str,
    sense: str,
) -> None:
    """Extract a single linkage word from a ``ws`` (Wikisaurus) template.

    The word is the template's 2nd positional parameter; empty words are
    skipped.
    """
    word = clean_node(wxr, None, t_node.template_parameters.get(2, ""))
    if word == "":
        return
    getattr(word_entry, linkage_type).append(
        Linkage(word=word, source=source, sense=sense)
    )
# Template name -> WordEntry linkage field; several aliases map to one field.
LINKAGE_TEMPLATES = {
    alias: field_name
    for field_name, aliases in (
        ("synonyms", ("syn", "synonyms", "synsee")),
        ("antonyms", ("ant", "antonyms")),
        ("coordinate_terms", ("cot", "coordinate terms")),
        ("hypernyms", ("hyper", "hypernyms")),
        ("hyponyms", ("hypo", "hyponyms")),
    )
    for alias in aliases
}
def extract_syn_template(
    wxr: WiktextractContext,
    word_entry: WordEntry,
    t_node: TemplateNode,
    linkage_type: str,
) -> None:
    """Extract words from an inline linkage template ({{syn}}, {{ant}}, …).

    Positional parameters from 2 upward are read until the first missing
    one. Thesaurus-page arguments are followed; other non-empty values are
    appended as linkage words. The sense is taken from the most recently
    added gloss.
    """
    # Guard against entries without any sense yet: the previous code
    # indexed `senses[-1]` unconditionally, raising IndexError when the
    # template appears before any gloss has been extracted.
    sense = (
        " ".join(word_entry.senses[-1].glosses)
        if len(word_entry.senses) > 0
        else ""
    )
    for arg_name in count(2):
        if arg_name not in t_node.template_parameters:
            break
        arg_value = clean_node(wxr, None, t_node.template_parameters[arg_name])
        if arg_value.startswith("อรรถาภิธาน:"):
            # thesaurus page link; pull linkage data from that page
            extract_thesaurus_page(
                wxr, word_entry, linkage_type, arg_value, sense
            )
        elif arg_value != "":
            getattr(word_entry, linkage_type).append(
                Linkage(word=arg_value, sense=sense)
            )
def extract_zh_dial_template(
    wxr: WiktextractContext,
    word_entry: WordEntry,
    t_node: TemplateNode,
    linkage_type: str,
    sense: str,
):
    """Extract dialectal Chinese synonyms from a ``zh-dial`` table template.

    The expanded table is walked twice: a first pass collects the footnote
    legend (rows following a "หมายเหตุ"/"note" header cell, formatted as
    ``symbol - text`` pairs separated by ";"), and a second pass collects
    the dialect words, tagging each with its language-group header and
    region link, plus any footnote markers rendered in small font.
    """
    # local import — presumably avoids a circular import with .sound
    from .sound import split_zh_pron_raw_tag

    linkage_list = []
    expanded_node = wxr.wtp.parse(
        wxr.wtp.node_to_wikitext(t_node), expand_all=True
    )
    for table_node in expanded_node.find_child_recursively(NodeKind.TABLE):
        # first pass: build the footnote-symbol -> note-text legend
        is_note_row = False
        note_tags = {}
        for row_node in table_node.find_child(NodeKind.TABLE_ROW):
            for cell_node in row_node.find_child(
                NodeKind.TABLE_HEADER_CELL | NodeKind.TABLE_CELL
            ):
                if cell_node.kind == NodeKind.TABLE_HEADER_CELL:
                    # a "หมายเหตุ" (note) header marks the start of legend cells
                    is_note_row = clean_node(wxr, None, cell_node) == "หมายเหตุ"
                elif is_note_row:
                    for note_str in clean_node(wxr, None, cell_node).split(";"):
                        if "-" in note_str:
                            note_symbol, note = note_str.split("-", maxsplit=1)
                            note_symbol = note_symbol.strip()
                            note = note.strip()
                            if note_symbol != "" and note != "":
                                note_tags[note_symbol] = note
        # second pass: collect the actual dialect words
        lang_tags = []
        region_tags = []
        for row_node in table_node.find_child(NodeKind.TABLE_ROW):
            if not row_node.contain_node(NodeKind.TABLE_CELL):
                continue  # skip header row
            for header_node in row_node.find_child(NodeKind.TABLE_HEADER_CELL):
                # row header names the language group; split into raw tags
                lang_tags = split_zh_pron_raw_tag(
                    clean_node(wxr, None, header_node)
                )
            if lang_tags == ["หมายเหตุ"]:  # skip last note row
                continue
            for cell_node in row_node.find_child(NodeKind.TABLE_CELL):
                for link_node in cell_node.find_child(NodeKind.LINK):
                    # linked text names the region/location of the words
                    region_tags = split_zh_pron_raw_tag(
                        clean_node(wxr, None, link_node)
                    )
                for span_tag in cell_node.find_html("span"):
                    span_text = clean_node(wxr, None, span_tag)
                    if span_text == "":
                        continue
                    if (
                        span_tag.attrs.get("lang", "") == "zh"
                        and span_text != wxr.wtp.title
                    ):
                        # a dialect word (skip the page's own headword)
                        l_data = Linkage(word=span_text, sense=sense)
                        if len(lang_tags) > 0:
                            l_data.raw_tags.extend(lang_tags)
                        if len(region_tags) > 0:
                            l_data.raw_tags.extend(region_tags)
                        translate_raw_tags(l_data)
                        linkage_list.append(l_data)
                    elif (
                        span_tag.attrs.get("style", "") == "font-size:60%"
                        and len(linkage_list) > 0
                    ):
                        # small-font footnote markers apply to the word
                        # appended immediately before this span
                        for note_symbol in span_text.split(","):
                            note_symbol = note_symbol.strip()
                            raw_tag = note_symbol
                            if note_symbol in note_tags:
                                raw_tag = note_tags[note_symbol]
                            if raw_tag != "":
                                linkage_list[-1].raw_tags.append(raw_tag)
                                translate_raw_tags(linkage_list[-1])

    getattr(word_entry, linkage_type).extend(linkage_list)
def extract_zh_l_template(
    wxr: WiktextractContext,
    t_node: TemplateNode,
    sense: str,
    raw_tags: list[str],
) -> list[Linkage]:
    """Extract linkage data from a ``zh-l`` (Chinese link) template.

    Returns one ``Linkage`` per Chinese word span in the expanded template
    (typically traditional and simplified forms), all sharing the same
    romanization and sense. Separator slashes and empty words are skipped.
    """
    l_list = []
    expanded_node = wxr.wtp.parse(
        wxr.wtp.node_to_wikitext(t_node), expand_all=True
    )
    roman = ""
    # the template's 2nd positional parameter, when present, overrides the
    # sense passed in by the caller
    new_sense = clean_node(wxr, None, t_node.template_parameters.get(2, ""))
    if new_sense != "":
        sense = new_sense
    # romanization comes from a "Latn"-classed span; if several exist the
    # last one wins
    for i_tag in expanded_node.find_html_recursively(
        "span", attr_name="class", attr_value="Latn"
    ):
        roman = clean_node(wxr, None, i_tag)
    for span_tag in expanded_node.find_html(
        "span", attr_name="lang", attr_value="zh"
    ):
        linkage_data = Linkage(
            sense=sense,
            raw_tags=raw_tags,
            roman=roman,
            word=clean_node(wxr, None, span_tag),
        )
        # the full lang attribute distinguishes the two Chinese scripts
        lang_attr = span_tag.attrs.get("lang", "")
        if lang_attr == "zh-Hant":
            linkage_data.tags.append("Traditional-Chinese")
        elif lang_attr == "zh-Hans":
            linkage_data.tags.append("Simplified-Chinese")
        # "/" spans are just separators between script forms; drop them
        if linkage_data.word not in ["/", ""]:
            translate_raw_tags(linkage_data)
            l_list.append(linkage_data)

    return l_list