Coverage for src/wiktextract/extractor/th/linkage.py: 88%
92 statements
« prev ^ index » next coverage.py v7.10.3, created at 2025-08-15 05:18 +0000
1from itertools import count
3from wikitextprocessor.parser import (
4 LEVEL_KIND_FLAGS,
5 LevelNode,
6 NodeKind,
7 TemplateNode,
8 WikiNode,
9)
11from ...page import clean_node
12from ...wxr_context import WiktextractContext
13from .models import Linkage, WordEntry
14from .section_titles import LINKAGE_SECTIONS
def extract_linkage_section(
    wxr: WiktextractContext,
    word_entry: WordEntry,
    level_node: LevelNode,
    linkage_type: str,
    source: str = "",
) -> None:
    """Collect linkage data (synonyms, antonyms, …) from a section.

    Dispatches each child of the section node to the matching handler:
    ``col*`` templates, the ``ws`` template, and plain wiki lists.
    ``source`` records where the data came from (e.g. a thesaurus page
    title) and is carried through to every created ``Linkage``.
    """
    for child in level_node.children:
        if isinstance(child, TemplateNode):
            if child.template_name.startswith("col"):
                extract_col_template(
                    wxr, word_entry, child, linkage_type, source
                )
            elif child.template_name == "ws":
                extract_ws_template(
                    wxr, word_entry, child, linkage_type, source
                )
        elif isinstance(child, WikiNode) and child.kind == NodeKind.LIST:
            for item in child.find_child(NodeKind.LIST_ITEM):
                extract_linkage_list_item(
                    wxr, word_entry, item, linkage_type, source
                )
def extract_col_template(
    wxr: WiktextractContext,
    word_entry: WordEntry,
    t_node: TemplateNode,
    linkage_type: str,
    source: str,
) -> None:
    """Expand a ``col*`` template and harvest one Linkage per word.

    Each rendered ``<li>`` may hold several ``<span>`` tags: spans with a
    ``lang`` attribute carry word forms (tagged by script class as
    traditional or simplified Chinese), while a ``Latn`` span supplies the
    romanization for every word already collected in that list item.
    """
    expanded = wxr.wtp.parse(
        wxr.wtp.node_to_wikitext(t_node), expand_all=True
    )
    for li_tag in expanded.find_html_recursively("li"):
        item_words = []
        for span_tag in li_tag.find_html("span"):
            span_class = span_tag.attrs.get("class", "")
            if "Latn" in span_class:
                # Romanization span: applies to all words in this item.
                roman = clean_node(wxr, None, span_tag)
                for linkage in item_words:
                    linkage.roman = roman
            elif "lang" in span_tag.attrs:
                word = clean_node(wxr, None, span_tag)
                if word != "":
                    item_words.append(Linkage(word=word, source=source))
                    if span_class == "Hant":
                        item_words[-1].tags.append("Traditional-Chinese")
                    elif span_class == "Hans":
                        item_words[-1].tags.append("Simplified-Chinese")
        getattr(word_entry, linkage_type).extend(item_words)
def extract_linkage_list_item(
    wxr: WiktextractContext,
    word_entry: WordEntry,
    list_item: WikiNode,
    linkage_type: str,
    source: str,
) -> None:
    """Extract linkage words from one wiki list item.

    Handles three child shapes: ``{{l}}`` templates, plain wiki links,
    and italicized links to thesaurus ("อรรถาภิธาน:") pages, which are
    expanded recursively (guarded so a thesaurus page never re-expands
    itself). Text after a dash ("-" or "–") is treated as a sense gloss
    shared by every word collected so far.
    """
    linkages = []
    for index, node in enumerate(list_item.children):
        if isinstance(node, TemplateNode) and node.template_name == "l":
            l_data = Linkage(
                word=clean_node(wxr, None, node.template_parameters.get(2, "")),
                source=source,
            )
            if l_data.word != "":
                linkages.append(l_data)
        elif isinstance(node, WikiNode) and node.kind == NodeKind.ITALIC:
            for link_node in node.find_child(NodeKind.LINK):
                link_str = clean_node(wxr, None, link_node)
                # Only recurse when we are not already inside a
                # thesaurus page, to avoid infinite recursion.
                if link_str.startswith("อรรถาภิธาน:") and not source.startswith(
                    "อรรถาภิธาน:"
                ):
                    extract_thesaurus_page(
                        wxr, word_entry, linkage_type, link_str
                    )
        elif isinstance(node, WikiNode) and node.kind == NodeKind.LINK:
            link_str = clean_node(wxr, None, node)
            if link_str != "":
                # Bug fix: propagate `source` here too, consistent with
                # the other Linkage constructions in this function.
                linkages.append(Linkage(word=link_str, source=source))
        elif isinstance(node, str) and ("-" in node or "–" in node):
            # The remainder of the list item after the dash is a sense
            # gloss applying to all words gathered above.
            dash = "-" if "-" in node else "–"
            sense = clean_node(
                wxr,
                None,
                [node[node.index(dash) + 1 :]]
                + list_item.children[index + 1 :],
            ).strip()
            for l_data in linkages:
                l_data.sense = sense
            break

    getattr(word_entry, linkage_type).extend(linkages)
def extract_thesaurus_page(
    wxr: WiktextractContext,
    word_entry: WordEntry,
    linkage_type: str,
    page_title: str,
) -> None:
    """Pull linkage data of one type from a thesaurus page (namespace 110).

    Walks the page's level-2 (language) and level-3 (part of speech)
    sections, keeping only those matching ``word_entry``, then runs the
    regular linkage extraction on every subsection whose title maps to
    ``linkage_type``, with the page title recorded as the source.
    """
    page = wxr.wtp.get_page(page_title, 110)
    if page is None or page.body is None:
        return
    root = wxr.wtp.parse(page.body)
    for lang_level in root.find_child(NodeKind.LEVEL2):
        # Language headings read "ภาษา<name>"; strip the prefix.
        lang_name = clean_node(wxr, None, lang_level.largs).removeprefix(
            "ภาษา"
        )
        if lang_name != word_entry.lang:
            continue
        for pos_level in lang_level.find_child(NodeKind.LEVEL3):
            if clean_node(wxr, None, pos_level.largs) != word_entry.pos_title:
                continue
            for sub_level in pos_level.find_child_recursively(
                LEVEL_KIND_FLAGS
            ):
                section_title = clean_node(wxr, None, sub_level.largs)
                if LINKAGE_SECTIONS.get(section_title) == linkage_type:
                    extract_linkage_section(
                        wxr,
                        word_entry,
                        sub_level,
                        linkage_type,
                        page_title,
                    )
def extract_ws_template(
    wxr: WiktextractContext,
    word_entry: WordEntry,
    t_node: TemplateNode,
    linkage_type: str,
    source: str,
) -> None:
    """Extract a single word from a ``ws`` template (second positional arg)."""
    word = clean_node(wxr, None, t_node.template_parameters.get(2, ""))
    if word == "":
        return
    getattr(word_entry, linkage_type).append(
        Linkage(word=word, source=source)
    )
# Maps inline linkage template names (short and long forms) to the
# WordEntry attribute that stores the corresponding Linkage list.
LINKAGE_TEMPLATES = {
    # synonyms
    "syn": "synonyms",
    "synonyms": "synonyms",
    "synsee": "synonyms",
    # antonyms
    "ant": "antonyms",
    "antonyms": "antonyms",
    # coordinate terms
    "cot": "coordinate_terms",
    "coordinate terms": "coordinate_terms",
    # hypernyms / hyponyms
    "hyper": "hypernyms",
    "hypernyms": "hypernyms",
    "hypo": "hyponyms",
    "hyponyms": "hyponyms",
}
def extract_syn_template(
    wxr: WiktextractContext,
    word_entry: WordEntry,
    t_node: TemplateNode,
    linkage_type: str,
) -> None:
    """Extract words from an inline synonym/antonym-style template.

    Positional arguments 2, 3, … hold the linked words; iteration stops
    at the first missing argument. A value naming a thesaurus page
    ("อรรถาภิธาน:…") is expanded instead of being stored directly.
    """
    arg_index = 2
    while arg_index in t_node.template_parameters:
        value = clean_node(
            wxr, None, t_node.template_parameters[arg_index]
        )
        if value.startswith("อรรถาภิธาน:"):
            extract_thesaurus_page(wxr, word_entry, linkage_type, value)
        elif value != "":
            getattr(word_entry, linkage_type).append(Linkage(word=value))
        arg_index += 1