Coverage for src/wiktextract/extractor/zh/linkage.py: 87%
169 statements
coverage.py v7.6.10, created at 2024-12-27 08:07 +0000
1import itertools
2from collections import defaultdict
4from wikitextprocessor.parser import (
5 LevelNode,
6 NodeKind,
7 TemplateNode,
8 WikiNode,
9)
11from ...page import clean_node
12from ...wxr_context import WiktextractContext
13from ..ruby import extract_ruby
14from .models import Linkage, WordEntry
15from .tags import translate_raw_tags
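# Entry point for one linkage section (synonyms, antonyms, ...). Its direct
# children are either wiki lists, whose items are processed one by one, or
# templates ({{sense}}, {{zh-dial}}, the "-saurus" family, {{col*}}) that
# expand to many terms at once. The collected Linkage objects are attached to
# page_data at the end: for a level-3 section to every entry with the same
# language code as the last one, otherwise only to the most recent entry.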
18def extract_linkage_section(
19 wxr: WiktextractContext,
20 page_data: list[WordEntry],
21 level_node: LevelNode,
22 linkage_type: str,
23) -> None:
24 sense = ""
25 linkage_list = []
26 for node in level_node.find_child(NodeKind.TEMPLATE | NodeKind.LIST):
27 if node.kind == NodeKind.LIST:
28 for item_node in node.find_child(NodeKind.LIST_ITEM):
29 sense, new_linkage_list = process_linkage_list_item(
30 wxr, item_node, sense
31 )
32 linkage_list.extend(new_linkage_list)
33 elif isinstance(node, TemplateNode):  [33 ↛ 26: line 33 didn't jump to line 26 because the condition on line 33 was always true]
34 if node.template_name in ["s", "sense"]:
35 sense = clean_node(wxr, None, node).strip("(): ")
36 elif node.template_name == "zh-dial":
37 linkage_list.extend(extract_zh_dial_template(wxr, node, sense))
38 elif node.template_name.endswith("-saurus"):  [38 ↛ 39: line 38 didn't jump to line 39 because the condition on line 38 was never true]
39 linkage_list.extend(
40 extract_saurus_template(
41 wxr, page_data, node, linkage_type, sense
42 )
43 )
44 elif node.template_name.startswith("col"):  [44 ↛ 26: line 44 didn't jump to line 26 because the condition on line 44 was always true]
45 linkage_list.extend(
46 process_linkage_col_template(wxr, node, sense)
47 )
49 if level_node.kind == NodeKind.LEVEL3:
50 for data in page_data:
51 if data.lang_code == page_data[-1].lang_code:  [51 ↛ 50: line 51 didn't jump to line 50 because the condition on line 51 was always true]
52 pre_linkage_list = getattr(data, linkage_type)
53 pre_linkage_list.extend(linkage_list)
54 elif len(page_data) > 0:  [54 ↛ exit: line 54 didn't return from function 'extract_linkage_section' because the condition on line 54 was always true]
55 pre_linkage_list = getattr(page_data[-1], linkage_type)
56 pre_linkage_list.extend(linkage_list)
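# Handles a single list item, e.g. a line roughly like
# "* {{sense|...}} {{zh-l|...}}、[[詞]]". {{s}}/{{sense}} updates the shared
# sense string (returned so it carries over to later items), {{qualifier}}
# collects raw tags for the next term, and link templates or plain wiki links
# become Linkage objects.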
59def process_linkage_list_item(
60 wxr: WiktextractContext, list_item: WikiNode, sense: str
61) -> tuple[str, list[Linkage]]:
62 raw_tags = []
63 linkage_list = []
64 for item_child in list_item.children:
65 if isinstance(item_child, TemplateNode):
66 if item_child.template_name in ["s", "sense"]:
67 sense = clean_node(wxr, None, item_child).strip("(): ")
68 elif item_child.template_name in ["qualifier", "qual"]:
69 raw_tags.append(clean_node(wxr, None, item_child).strip("()"))
70 elif item_child.template_name == "zh-l":
71 linkage_list.extend(
72 process_zh_l_template(wxr, item_child, sense, raw_tags)
73 )
74 raw_tags.clear()
75 elif item_child.template_name == "ja-r":
76 linkage_list.append(
77 process_ja_r_template(wxr, item_child, sense, raw_tags)
78 )
79 raw_tags.clear()
80 elif item_child.template_name in ["l", "link", "alter"]:  [80 ↛ 64: line 80 didn't jump to line 64 because the condition on line 80 was always true]
81 linkage_list.extend(
82 process_l_template(wxr, item_child, sense, raw_tags)
83 )
84 raw_tags.clear()
85 elif (
86 isinstance(item_child, WikiNode)
87 and item_child.kind == NodeKind.LINK
88 ):
89 word = clean_node(wxr, None, item_child)
90 if len(word) > 0:  [90 ↛ 64: line 90 didn't jump to line 64 because the condition on line 90 was always true]
91 linkage_data = Linkage(
92 word=word, sense=sense, raw_tags=raw_tags
93 )
94 translate_raw_tags(linkage_data)
95 linkage_list.append(linkage_data)
96 raw_tags.clear()
97 return sense, linkage_list
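# The "-saurus" templates ({{zh-syn-saurus}}, {{zh-ant-saurus}} and their
# newer replacements) point at a Thesaurus: page. Rather than parsing that
# page again, the pre-built thesaurus database is searched for terms matching
# the current language, part of speech and linkage type; the headword itself
# is skipped.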
100def extract_saurus_template(
101 wxr: WiktextractContext,
102 page_data: list[WordEntry],
103 node: TemplateNode,
104 linkage_type: str,
105 sense: str,
106) -> list[Linkage]:
107 """
108 Extract data from templates whose names end with "-saurus", such as
109 "zh-syn-saurus" and "zh-ant-saurus". These templates take their data from
110 thesaurus pages, so query the thesaurus database instead of parsing those pages again.
112 https://zh.wiktionary.org/wiki/Template:Syn-saurus
113 """
114 from wiktextract.thesaurus import search_thesaurus
116 linkage_data = []
117 if node.template_name in ("zh-syn-saurus", "zh-ant-saurus"):
118 # obsolete templates
119 thesaurus_page_title = node.template_parameters.get(1)
120 else:
121 thesaurus_page_title = node.template_parameters.get(2)
123 for thesaurus in search_thesaurus(
124 wxr.thesaurus_db_conn,
125 thesaurus_page_title,
126 page_data[-1].lang_code,
127 page_data[-1].pos,
128 linkage_type,
129 ):
130 if thesaurus.term == wxr.wtp.title:
131 continue
132 linkage_data.append(
133 Linkage(
134 word=thesaurus.term,
135 roman=thesaurus.roman,
136 tags=thesaurus.tags,
137 raw_tags=thesaurus.raw_tags,
138 sense=sense,
139 )
140 )
142 return linkage_data
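# {{zh-dial}} expands to a dialect table: header cells name the dialect group,
# the first link in a data cell names the region, <span lang="zh"> cells hold
# the dialectal forms, and small "font-size:60%" spans after a form act as
# extra labels for it. Each distinct form becomes one Linkage whose raw_tags
# gather all dialect/region labels it appeared under.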
145def extract_zh_dial_template(
146 wxr: WiktextractContext, template_node: TemplateNode, sense: str
147) -> list[Linkage]:
148 linkage_list = []
149 dial_data = defaultdict(set)
150 tag_data = defaultdict(set)
151 expanded_node = wxr.wtp.parse(
152 wxr.wtp.node_to_wikitext(template_node), expand_all=True
153 )
154 for table_node in expanded_node.find_child_recursively(NodeKind.TABLE):
155 lang_tag = ""
156 region_tag = ""
157 for row_node in table_node.find_child(NodeKind.TABLE_ROW):
158 if not row_node.contain_node(NodeKind.TABLE_CELL):
159 continue # skip header row
160 for header_node in row_node.find_child(NodeKind.TABLE_HEADER_CELL):
161 lang_tag = clean_node(wxr, None, header_node)
162 if lang_tag == "註解": # skip last note row
163 continue
164 for cell_node in row_node.find_child(NodeKind.TABLE_CELL):
165 for link_node in cell_node.find_child(NodeKind.LINK):
166 region_tag = clean_node(wxr, None, link_node)
167 word = ""
168 for span_tag in cell_node.find_html("span"):
169 span_text = clean_node(wxr, None, span_tag)
170 if span_text == "":  [170 ↛ 171: line 170 didn't jump to line 171 because the condition on line 170 was never true]
171 continue
172 if (
173 span_tag.attrs.get("lang", "") == "zh"
174 and span_text != wxr.wtp.title
175 ):
176 word = span_text
177 if lang_tag != "":  [177 ↛ 179: line 177 didn't jump to line 179 because the condition on line 177 was always true]
178 dial_data[span_text].add(lang_tag)
179 if region_tag != "":
180 dial_data[span_text].add(region_tag)
181 elif (
182 span_tag.attrs.get("style", "") == "font-size:60%"
183 and word != ""
184 ):
185 tag_data[word].add(span_text)
187 for term, lang_tags in dial_data.items():
188 linkage_data = Linkage(word=term, sense=sense, raw_tags=list(lang_tags))
189 linkage_data.raw_tags.extend(list(tag_data.get(term, {})))
190 translate_raw_tags(linkage_data)
191 linkage_list.append(linkage_data)
192 return linkage_list
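# {{zh-l}} links a Chinese term, typically rendering both scripts plus a
# romanisation. The expanded HTML is scanned for a "Latn" span (the roman
# form) and for lang="zh…" spans, which become separate Linkage entries
# tagged Traditional/Simplified Chinese; the bare "/" between the two scripts
# is dropped.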
195def process_zh_l_template(
196 wxr: WiktextractContext,
197 template_node: TemplateNode,
198 sense: str,
199 raw_tags: list[str] = [],
200) -> list[Linkage]:
201 # https://zh.wiktionary.org/wiki/Template:Zh-l
202 expanded_node = wxr.wtp.parse(
203 wxr.wtp.node_to_wikitext(template_node), expand_all=True
204 )
205 roman = ""
206 linkage_list = []
207 for i_tag in expanded_node.find_html_recursively(
208 "span", attr_name="class", attr_value="Latn"
209 ):
210 roman = clean_node(wxr, None, i_tag)
211 for span_tag in expanded_node.find_html(
212 "span", attr_name="lang", attr_value="zh"
213 ):
214 linkage_data = Linkage(
215 sense=sense,
216 raw_tags=raw_tags,
217 roman=roman,
218 word=clean_node(wxr, None, span_tag),
219 )
220 lang_attr = span_tag.attrs.get("lang", "")
221 if lang_attr == "zh-Hant":
222 linkage_data.tags.append("Traditional Chinese")
223 elif lang_attr == "zh-Hans":
224 linkage_data.tags.append("Simplified Chinese")
225 if len(linkage_data.word) > 0 and linkage_data.word != "/":
226 translate_raw_tags(linkage_data)
227 linkage_list.append(linkage_data)
228 return linkage_list
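# {{ja-r}} links a Japanese term with furigana. The span carrying a lang
# attribute supplies the word (ruby text is stripped out and stored
# separately), a "tr" span supplies the romanisation, and a "mention-gloss"
# span, when present, replaces the sense.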
231def process_ja_r_template(
232 wxr: WiktextractContext,
233 template_node: TemplateNode,
234 sense: str,
235 raw_tags: list[str] = [],
236) -> Linkage:
237 # https://zh.wiktionary.org/wiki/Template:Ja-r
238 expanded_node = wxr.wtp.parse(
239 wxr.wtp.node_to_wikitext(template_node), expand_all=True
240 )
241 linkage_data = Linkage(sense=sense, raw_tags=raw_tags)
242 for span_node in expanded_node.find_html("span"):
243 span_class = span_node.attrs.get("class", "")
244 if "lang" in span_node.attrs:
245 ruby_data, no_ruby_nodes = extract_ruby(wxr, span_node)
246 linkage_data.word = clean_node(wxr, None, no_ruby_nodes)
247 linkage_data.ruby = ruby_data
248 elif "tr" in span_class:
249 linkage_data.roman = clean_node(wxr, None, span_node)
250 elif "mention-gloss" == span_class: 250 ↛ 251line 250 didn't jump to line 251 because the condition on line 250 was never true
251 linkage_data.sense = clean_node(wxr, None, span_node)
253 translate_raw_tags(linkage_data)
254 return linkage_data
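# Generic {{l}}/{{link}}/{{alter}} handling: every span with a lang attribute
# in the expanded template is taken as one linked term.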
257def process_l_template(
258 wxr: WiktextractContext,
259 template_node: TemplateNode,
260 sense: str,
261 raw_tags: list[str] = [],
262) -> list[Linkage]:
263 # https://zh.wiktionary.org/wiki/Template:l
264 expanded_node = wxr.wtp.parse(
265 wxr.wtp.node_to_wikitext(template_node), expand_all=True
266 )
267 linkage_list = []
268 for span_tag in expanded_node.find_html("span", attr_name="lang"):
269 linkage_data = Linkage(
270 sense=sense, raw_tags=raw_tags, word=clean_node(wxr, None, span_tag)
271 )
272 if len(linkage_data.word) > 0:  [272 ↛ 268: line 272 didn't jump to line 268 because the condition on line 272 was always true]
273 translate_raw_tags(linkage_data)
274 linkage_list.append(linkage_data)
275 return linkage_list
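# {{col*}} column templates are delegated to the shared helper in the
# thesaurus module; its results are converted into Linkage objects with the
# section's sense attached.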
278def process_linkage_col_template(
279 wxr: WiktextractContext, template_node: TemplateNode, sense: str
280) -> list[Linkage]:
281 from .thesaurus import process_col_template
283 linkage_list = []
284 for data in process_col_template(wxr, "", "", "", "", "", template_node):
285 linkage_data = Linkage(
286 word=data.term,
287 roman=data.roman,
288 tags=data.tags,
289 raw_tags=data.raw_tags,
290 sense=sense,
291 )
292 if len(linkage_data.word) > 0:  [292 ↛ 284: line 292 didn't jump to line 284 because the condition on line 292 was always true]
293 translate_raw_tags(linkage_data)
294 linkage_list.append(linkage_data)
295 return linkage_list
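# Linkage templates embedded in a gloss carry their target words as positional
# parameters starting at 2; empty values and links to Thesaurus: pages are
# skipped, and the results go straight onto the latest WordEntry.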
298def process_linkage_templates_in_gloss(
299 wxr: WiktextractContext,
300 page_data: list[WordEntry],
301 template_node: TemplateNode,
302 linkage_type: str,
303 sense: str,
304) -> None:
305 for word_index in itertools.count(2):  [305 ↛ exit: line 305 didn't return from function 'process_linkage_templates_in_gloss' because the loop on line 305 didn't complete]
306 if word_index not in template_node.template_parameters:
307 break
308 word = clean_node(
309 wxr, None, template_node.template_parameters[word_index]
310 )
311 if len(word) == 0:  [311 ↛ 312: line 311 didn't jump to line 312 because the condition on line 311 was never true]
312 continue
313 if word.startswith(wxr.wtp.NAMESPACE_DATA["Thesaurus"]["name"] + ":"):  [313 ↛ 314: line 313 didn't jump to line 314 because the condition on line 313 was never true]
314 continue
315 linkage = Linkage(word=word, sense=sense)
316 pre_data = getattr(page_data[-1], linkage_type)
317 pre_data.append(linkage)