Coverage for src/wiktextract/extractor/th/linkage.py: 88%
93 statements
« prev ^ index » next    coverage.py v7.10.6, created at 2025-09-12 08:27 +0000
1from itertools import count
3from wikitextprocessor.parser import (
4 LEVEL_KIND_FLAGS,
5 LevelNode,
6 NodeKind,
7 TemplateNode,
8 WikiNode,
9)
11from ...page import clean_node
12from ...wxr_context import WiktextractContext
13from .models import Linkage, WordEntry
14from .section_titles import LINKAGE_SECTIONS
def extract_linkage_section(
    wxr: WiktextractContext,
    word_entry: WordEntry,
    level_node: LevelNode,
    linkage_type: str,
    source: str = "",
    sense: str = "",
) -> None:
    """Collect linkage data (synonyms, antonyms, …) from a section node.

    Each direct child of ``level_node`` is dispatched to the matching
    extractor: ``col*`` templates, the ``ws`` template, or plain wiki
    lists.  ``linkage_type`` names the ``WordEntry`` list field the
    results are appended to.
    """
    for child in level_node.children:
        if isinstance(child, TemplateNode):
            if child.template_name.startswith("col"):
                extract_col_template(
                    wxr, word_entry, child, linkage_type, source, sense
                )
            elif child.template_name == "ws":
                extract_ws_template(
                    wxr, word_entry, child, linkage_type, source, sense
                )
        elif isinstance(child, WikiNode) and child.kind == NodeKind.LIST:
            for item in child.find_child(NodeKind.LIST_ITEM):
                extract_linkage_list_item(
                    wxr, word_entry, item, linkage_type, source, sense
                )
def extract_col_template(
    wxr: WiktextractContext,
    word_entry: WordEntry,
    t_node: TemplateNode,
    linkage_type: str,
    source: str,
    sense: str,
) -> None:
    """Extract linkage words from an expanded ``col*`` list template.

    The template expands to an HTML list; each ``<li>`` may contain one
    or more word spans (identified by a ``lang`` attribute) plus an
    optional romanization span (class containing ``Latn``).  ``Hant`` /
    ``Hans`` span classes are recorded as script tags.
    """
    expanded_node = wxr.wtp.parse(
        wxr.wtp.node_to_wikitext(t_node), expand_all=True
    )
    for li_tag in expanded_node.find_html_recursively("li"):
        l_data = []
        for span_tag in li_tag.find_html("span"):
            span_class = span_tag.attrs.get("class", "")
            if "Latn" in span_class:
                # A romanization span applies to every word collected so
                # far in this list item.
                for data in l_data:
                    data.roman = clean_node(wxr, None, span_tag)
            elif "lang" in span_tag.attrs:
                word = clean_node(wxr, None, span_tag)
                if word != "":
                    l_data.append(
                        Linkage(word=word, source=source, sense=sense)
                    )
                    if span_class == "Hant":
                        l_data[-1].tags.append("Traditional-Chinese")
                    elif span_class == "Hans":
                        l_data[-1].tags.append("Simplified-Chinese")
        getattr(word_entry, linkage_type).extend(l_data)
def extract_linkage_list_item(
    wxr: WiktextractContext,
    word_entry: WordEntry,
    list_item: WikiNode,
    linkage_type: str,
    source: str,
    sense: str,
) -> None:
    """Extract linkage words from one wiki list item.

    Handles ``{{l}}`` templates, italicized thesaurus links, plain wiki
    links, and a trailing dash-separated sense gloss that applies to all
    words gathered from the item.
    """
    linkages = []

    for index, node in enumerate(list_item.children):
        if isinstance(node, TemplateNode) and node.template_name == "l":
            l_data = Linkage(
                word=clean_node(wxr, None, node.template_parameters.get(2, "")),
                source=source,
                sense=sense,
            )
            if l_data.word != "":
                linkages.append(l_data)
        elif isinstance(node, WikiNode) and node.kind == NodeKind.ITALIC:
            for link_node in node.find_child(NodeKind.LINK):
                link_str = clean_node(wxr, None, link_node)
                # "อรรถาภิธาน:" is the Thai "Thesaurus:" namespace prefix;
                # the second check prevents recursing from one thesaurus
                # page into another.
                if link_str.startswith("อรรถาภิธาน:") and not source.startswith(
                    "อรรถาภิธาน:"
                ):
                    extract_thesaurus_page(
                        wxr, word_entry, linkage_type, link_str, sense
                    )
        elif isinstance(node, WikiNode) and node.kind == NodeKind.LINK:
            link_str = clean_node(wxr, None, node)
            if link_str != "":
                linkages.append(Linkage(word=link_str, sense=sense))
        elif isinstance(node, str) and ("-" in node or "–" in node):
            # Text after the dash (plus any remaining siblings) is the
            # sense gloss for this list item; apply it retroactively.
            if "-" in node:
                sense = node[node.index("-") + 1 :]
            elif "–" in node:
                sense = node[node.index("–") + 1 :]
            sense = clean_node(
                wxr,
                None,
                [sense] + list_item.children[index + 1 :],
            ).strip()
            for l_data in linkages:
                l_data.sense = sense
            break

    getattr(word_entry, linkage_type).extend(linkages)
def extract_thesaurus_page(
    wxr: WiktextractContext,
    word_entry: WordEntry,
    linkage_type: str,
    page_title: str,
    sense: str,
) -> None:
    """Follow a thesaurus link and extract matching linkage sections.

    Only sections whose language, part-of-speech title, and linkage
    section title match ``word_entry`` and ``linkage_type`` are used.
    """
    # 110 is the "อรรถาภิธาน" (Thesaurus) namespace id.
    page = wxr.wtp.get_page(page_title, 110)
    if page is None or page.body is None:
        return
    root = wxr.wtp.parse(page.body)
    for level2_node in root.find_child(NodeKind.LEVEL2):
        # Section titles look like "ภาษาไทย" ("Thai language"); strip the
        # "ภาษา" ("language") prefix to compare with the entry's language.
        lang_name = clean_node(wxr, None, level2_node.largs).removeprefix(
            "ภาษา"
        )
        if lang_name != word_entry.lang:
            continue
        for level3_node in level2_node.find_child(NodeKind.LEVEL3):
            pos_title = clean_node(wxr, None, level3_node.largs)
            if pos_title != word_entry.pos_title:
                continue
            for linkage_level_node in level3_node.find_child_recursively(
                LEVEL_KIND_FLAGS
            ):
                linkage_title = clean_node(wxr, None, linkage_level_node.largs)
                if LINKAGE_SECTIONS.get(linkage_title) != linkage_type:
                    continue
                extract_linkage_section(
                    wxr,
                    word_entry,
                    linkage_level_node,
                    linkage_type,
                    source=page_title,
                    sense=sense,
                )
def extract_ws_template(
    wxr: WiktextractContext,
    word_entry: WordEntry,
    t_node: TemplateNode,
    linkage_type: str,
    source: str,
    sense: str,
) -> None:
    """Extract one linkage word from a ``ws`` (Wikisaurus) template.

    The word is the template's second positional argument; empty values
    are ignored.
    """
    word = clean_node(wxr, None, t_node.template_parameters.get(2, ""))
    if word != "":
        l_data = Linkage(word=word, source=source, sense=sense)
        getattr(word_entry, linkage_type).append(l_data)
# Template-name aliases mapped to the WordEntry linkage field they fill.
LINKAGE_TEMPLATES = {
    alias: field
    for field, aliases in (
        ("synonyms", ("syn", "synonyms", "synsee")),
        ("antonyms", ("ant", "antonyms")),
        ("coordinate_terms", ("cot", "coordinate terms")),
        ("hypernyms", ("hyper", "hypernyms")),
        ("hyponyms", ("hypo", "hyponyms")),
    )
    for alias in aliases
}
def extract_syn_template(
    wxr: WiktextractContext,
    word_entry: WordEntry,
    t_node: TemplateNode,
    linkage_type: str,
) -> None:
    """Extract linkage words from an inline template like ``{{syn}}``.

    The words belong to the most recently added sense; thesaurus-page
    arguments are expanded via :func:`extract_thesaurus_page`.
    """
    sense = " ".join(word_entry.senses[-1].glosses)
    # Positional word arguments start at index 2 and run until the first
    # missing index.
    for arg_name in count(2):
        if arg_name not in t_node.template_parameters:
            break
        arg_value = clean_node(wxr, None, t_node.template_parameters[arg_name])
        # "อรรถาภิธาน:" is the Thai "Thesaurus:" namespace prefix.
        if arg_value.startswith("อรรถาภิธาน:"):
            extract_thesaurus_page(
                wxr, word_entry, linkage_type, arg_value, sense
            )
        elif arg_value != "":
            getattr(word_entry, linkage_type).append(
                Linkage(word=arg_value, sense=sense)
            )