Coverage for src / wiktextract / extractor / ja / linkage.py: 90%
172 statements
« prev ^ index » next coverage.py v7.12.0, created at 2025-11-21 08:01 +0000
1from mediawiki_langcodes import name_to_code
2from wikitextprocessor import (
3 HTMLNode,
4 LevelNode,
5 NodeKind,
6 TemplateNode,
7 WikiNode,
8)
10from ...page import clean_node
11from ...wxr_context import WiktextractContext
12from ..ruby import extract_ruby
13from .models import Descendant, Form, Linkage, WordEntry
14from .section_titles import LINKAGES
15from .tags import translate_raw_tags
def extract_linkage_section(
    wxr: WiktextractContext,
    word_entry: WordEntry,
    level_node: LevelNode,
    linkage_type: str,
) -> None:
    """Extract one linkage-type section of a page into ``word_entry``.

    "cognates" and "descendants" sections use a different list layout and
    are delegated to the descendant handler.  Otherwise, a leading
    ``rel-top*`` template supplies a sense label that applies to the
    following lists, and each list item is parsed individually.  The
    linkage type may be refined by a list item's own label, so the value
    returned by the item handler is carried forward to later items.
    """
    if linkage_type in ("cognates", "descendants"):
        extract_descendant_section(wxr, word_entry, level_node, linkage_type)
        return
    current_sense = ""
    for child in level_node.find_child(NodeKind.LIST | NodeKind.TEMPLATE):
        if isinstance(child, TemplateNode) and child.template_name.startswith(
            "rel-top"
        ):
            # First template argument is the sense for subsequent lists.
            current_sense = clean_node(
                wxr, None, child.template_parameters.get(1, "")
            )
        elif child.kind == NodeKind.LIST:
            for item in child.find_child_recursively(NodeKind.LIST_ITEM):
                linkage_type = process_linkage_list_item(
                    wxr, word_entry, item, linkage_type, current_sense
                )
def process_linkage_list_item(
    wxr: WiktextractContext,
    word_entry: WordEntry,
    list_item: WikiNode,
    linkage_type: str,
    sense: str,
) -> str:
    """Parse one linkage list item and append its words to ``word_entry``.

    Text before the first ":" in the item may name a linkage type (looked
    up in ``LINKAGES``), overriding the section-level ``linkage_type``.
    Words can appear as ruby templates ("おくりがな", "ふりがな", "xlink"),
    "l" / "zh-l" link templates, or plain wiki links; a "sense" template
    updates the sense text for the words that follow it.

    Returns the (possibly updated) linkage type so the caller can apply it
    to subsequent list items.
    """
    after_colon = False
    for node_idx, node in enumerate(list_item.children):
        if isinstance(node, str) and ":" in node and not after_colon:
            # Everything before this node is the linkage-type label.
            linkage_type_text = clean_node(
                wxr, None, list_item.children[:node_idx]
            )
            linkage_type = LINKAGES.get(linkage_type_text, linkage_type)
            after_colon = True
        elif isinstance(node, TemplateNode) and node.template_name.startswith(
            ("おくりがな", "ふりがな", "xlink")
        ):
            # Ruby (furigana) templates: expand, then split ruby from text.
            expanded_node = wxr.wtp.parse(
                wxr.wtp.node_to_wikitext(node), expand_all=True
            )
            ruby, no_ruby = extract_ruby(wxr, expanded_node.children)
            if node.template_name == "xlink":
                # "xlink" carries no real ruby; discard it.
                ruby.clear()
            word = clean_node(wxr, None, no_ruby)
            if len(word) > 0:
                getattr(word_entry, linkage_type).append(
                    Linkage(word=word, ruby=ruby, sense=sense)
                )
        elif isinstance(node, TemplateNode) and node.template_name == "l":
            l_data = extract_l_template(wxr, node)
            if l_data.word != "":
                getattr(word_entry, linkage_type).append(l_data)
        elif isinstance(node, TemplateNode) and node.template_name == "zh-l":
            getattr(word_entry, linkage_type).extend(
                extract_zh_l_template(wxr, node)
            )
        elif isinstance(node, WikiNode) and node.kind == NodeKind.LINK:
            word = clean_node(wxr, None, node)
            if len(word) > 0:
                getattr(word_entry, linkage_type).append(
                    Linkage(word=word, sense=sense)
                )
        elif isinstance(node, TemplateNode) and node.template_name == "sense":
            # Strip surrounding ASCII/fullwidth parentheses and colons.
            sense = clean_node(wxr, None, node).strip("(): ")

    return linkage_type
def extract_descendant_section(
    wxr: WiktextractContext,
    word_entry: WordEntry,
    level_node: LevelNode,
    linkage_type: str,
) -> None:
    """Collect descendant/cognate entries from every list in the section.

    Each top-level list item is parsed (recursively, for nested lists) and
    the resulting Descendant objects are appended to the ``word_entry``
    field named by ``linkage_type``.
    """
    collected: list[Descendant] = []
    for top_list in level_node.find_child(NodeKind.LIST):
        for item in top_list.find_child(NodeKind.LIST_ITEM):
            collected.extend(process_desc_list_item(wxr, item, []))
    getattr(word_entry, linkage_type).extend(collected)
def process_desc_list_item(
    wxr: WiktextractContext, list_item: WikiNode, parent_list: list[Descendant]
) -> list[Descendant]:
    """Parse one descendant list item, recursing into nested lists.

    The item's language is taken from the text before the first ":" (name
    resolved to a code via ``name_to_code``) or from an "etyl" template;
    words come from "l", "desc", or "zh-l" templates.  Nested child lists
    are attached as descendants of this item's entries.  Every entry found
    here is also appended to each Descendant in ``parent_list`` (mutated
    in place).  Returns the entries found at this level.
    """
    desc_list = []
    lang_name = "unknown"
    lang_code = "unknown"
    for index, child in enumerate(list_item.children):
        if isinstance(child, str) and ":" in child and lang_name == "unknown":
            # Text before the colon names the language.
            lang_name = clean_node(wxr, None, list_item.children[:index])
            lang_code = name_to_code(lang_name, "ja")
        elif isinstance(child, TemplateNode) and child.template_name == "etyl":
            lang_name = clean_node(wxr, None, child)
            lang_code = clean_node(
                wxr, None, child.template_parameters.get(1, "")
            )
        elif isinstance(child, TemplateNode) and child.template_name == "l":
            l_data = extract_l_template(wxr, child)
            if l_data.word != "":
                desc_list.append(
                    Descendant(
                        word=l_data.word,
                        lang=lang_name,
                        # Fall back to the template's own language argument
                        # when no code was resolved from the item text.
                        lang_code=lang_code
                        or clean_node(
                            wxr, None, child.template_parameters.get(1, "")
                        ),
                        tags=l_data.tags,
                        raw_tags=l_data.raw_tags,
                        roman=l_data.roman,
                        sense=l_data.sense,
                    )
                )
        elif isinstance(child, TemplateNode) and child.template_name == "desc":
            # "desc" carries its own language; it overrides the current one.
            new_descs, lang_code, lang_name = extract_desc_template(wxr, child)
            desc_list.extend(new_descs)
        elif isinstance(child, TemplateNode) and child.template_name == "zh-l":
            for l_data in extract_zh_l_template(wxr, child):
                if l_data.word != "":
                    desc_list.append(
                        Descendant(
                            word=l_data.word,
                            lang=lang_name,
                            lang_code=lang_code,
                            tags=l_data.tags,
                            roman=l_data.roman,
                        )
                    )
        elif isinstance(child, WikiNode) and child.kind == NodeKind.LIST:
            # Nested list: its entries become descendants of this level.
            for next_list_item in child.find_child(NodeKind.LIST_ITEM):
                process_desc_list_item(wxr, next_list_item, desc_list)

    for p_data in parent_list:
        p_data.descendants.extend(desc_list)
    return desc_list
# Category: grammar templates ("カテゴリ:文法テンプレート")
# Maps gloss-list linkage template names to the corresponding WordEntry
# list-field names used by extract_gloss_list_linkage_template().
LINKAGE_TEMPLATES = {
    "syn": "synonyms",
    "ant": "antonyms",
    "hyper": "hypernyms",
    "hypo": "hyponyms",
    "hyponyms": "hyponyms",
    "mero": "meronyms",
    "cot": "coordinate_terms",
}
def extract_gloss_list_linkage_template(
    wxr: WiktextractContext, word_entry: WordEntry, t_node: TemplateNode
) -> None:
    """Handle a linkage template (syn/ant/…) placed inside a gloss list.

    Expands the template, then reads every ``<span>`` whose ``lang``
    attribute matches the template's first argument.  Each non-empty span
    becomes a Linkage on the field named by ``LINKAGE_TEMPLATES``, with
    the most recent sense's glosses (if any) as the sense text.
    """
    expanded = wxr.wtp.parse(
        wxr.wtp.node_to_wikitext(t_node), expand_all=True
    )
    lang_code = clean_node(wxr, None, t_node.template_parameters.get(1, ""))
    # The linkage belongs to the last sense added to the entry.
    if len(word_entry.senses) > 0 and len(word_entry.senses[-1].glosses) > 0:
        sense_text = " ".join(word_entry.senses[-1].glosses)
    else:
        sense_text = ""
    target_list = getattr(word_entry, LINKAGE_TEMPLATES[t_node.template_name])
    for span_tag in expanded.find_html(
        "span", attr_name="lang", attr_value=lang_code
    ):
        word = clean_node(wxr, None, span_tag)
        if word != "":
            target_list.append(Linkage(word=word, sense=sense_text))
def extract_l_template(
    wxr: WiktextractContext, t_node: TemplateNode
) -> Linkage:
    """Parse an "l" link template into a Linkage.

    https://ja.wiktionary.org/wiki/テンプレート:l

    The expanded template is scanned for ``<span>`` elements: the span in
    the template's language holds the word, the "<code>-Latn" span holds
    the romanization, and "gender" spans contribute raw tags.  Sense text
    comes from parameter 4 / "gloss" / "t" (later names take precedence),
    and "lit" supplies a literal meaning.
    """
    expanded = wxr.wtp.parse(
        wxr.wtp.node_to_wikitext(t_node), expand_all=True
    )
    lang_code = clean_node(wxr, None, t_node.template_parameters.get(1, ""))
    result = Linkage(word="")
    for span_tag in expanded.find_html("span"):
        span_lang = span_tag.attrs.get("lang", "")
        if span_lang == lang_code:
            result.word = clean_node(wxr, None, span_tag)
        elif span_lang == lang_code + "-Latn":
            result.roman = clean_node(wxr, None, span_tag)
        elif span_tag.attrs.get("class", "") == "gender":
            gender_text = clean_node(wxr, None, span_tag)
            if gender_text != "":
                result.raw_tags.append(gender_text)

    if "lit" in t_node.template_parameters:
        result.literal_meaning = clean_node(
            wxr, None, t_node.template_parameters["lit"]
        )
    # Later keys deliberately overwrite earlier ones ("t" wins).
    for sense_arg in (4, "gloss", "t"):
        if sense_arg in t_node.template_parameters:
            result.sense = clean_node(
                wxr, None, t_node.template_parameters[sense_arg]
            )
    translate_raw_tags(result)
    return result
def extract_alt_form_section(
    wxr: WiktextractContext, word_entry: WordEntry, level_node: LevelNode
) -> None:
    """Extract an "alternative forms" section into ``word_entry.forms``.

    Forms come from plain wiki links or "l" templates inside list items;
    each form is tagged "alternative".  Parenthesized text following a
    form (ASCII or fullwidth parentheses, possibly spread over several
    nodes) is collected as raw tags for that form.  ``parentheses`` counts
    unbalanced opening parens so multi-node tag runs are captured.
    """
    forms = []
    parentheses = 0  # depth of currently-open ( / （ groups
    tag_nodes = []  # nodes accumulated for the pending tag text

    def add_tag():
        # Attach the buffered parenthesized text to the latest form.
        if len(forms) > 0 and len(tag_nodes) > 0:
            raw_tag = clean_node(wxr, None, tag_nodes).strip("()() ")
            if raw_tag != "":
                forms[-1].raw_tags.append(raw_tag)
                translate_raw_tags(forms[-1])
        tag_nodes.clear()

    for list_node in level_node.find_child(NodeKind.LIST):
        for list_item in list_node.find_child(NodeKind.LIST_ITEM):
            for node in list_item.children:
                if (
                    isinstance(node, WikiNode)
                    and node.kind == NodeKind.LINK
                    and parentheses == 0
                ):
                    word = clean_node(wxr, None, node)
                    if word != "":
                        forms.append(Form(form=word, tags=["alternative"]))
                        # A new form closes the previous form's tag buffer.
                        add_tag()
                elif (
                    isinstance(node, TemplateNode) and node.template_name == "l"
                ):
                    l_data = extract_l_template(wxr, node)
                    if l_data.word != "":
                        forms.append(
                            Form(
                                form=l_data.word,
                                tags=l_data.tags + ["alternative"],
                                raw_tags=l_data.raw_tags,
                                roman=l_data.roman,
                                literal_meaning=l_data.literal_meaning,
                            )
                        )
                        add_tag()
                elif (
                    isinstance(node, str)
                    and node.strip().startswith(("(", "("))
                    and node.strip().endswith((")", ")"))
                ):
                    # Fully parenthesized in one node: buffer without
                    # changing the open-paren depth.
                    tag_nodes.append(node)
                elif isinstance(node, str) and ("(" in node or "(" in node):
                    parentheses += 1
                    tag_nodes.append(node)
                elif isinstance(node, str) and (")" in node or ")" in node):
                    parentheses -= 1
                    tag_nodes.append(node)
                elif parentheses > 0:
                    # Any node inside an open parenthesis is tag content.
                    tag_nodes.append(node)
    add_tag()
    word_entry.forms.extend(forms)
def extract_desc_template(
    wxr: WiktextractContext, t_node: TemplateNode
) -> tuple[list[Descendant], str, str]:
    """Expand a "desc" template.

    Returns ``(descendants, lang_code, lang_name)`` so the caller can
    reuse the language for sibling nodes.  The language code is the
    template's first argument; the language name is the leading text
    ending with ":"; words are links inside the matching-language span.
    """
    expanded = wxr.wtp.parse(
        wxr.wtp.node_to_wikitext(t_node), expand_all=True
    )
    lang_code = clean_node(wxr, None, t_node.template_parameters.get(1, ""))
    lang_name = "unknown"
    results: list[Descendant] = []
    for child in expanded.children:
        if isinstance(child, str):
            if child.strip().endswith(":"):
                lang_name = child.strip(": ")
        elif (
            isinstance(child, HTMLNode)
            and child.tag == "span"
            and child.attrs.get("lang", "") == lang_code
        ):
            for link_node in child.find_child(NodeKind.LINK):
                word_text = clean_node(wxr, None, link_node)
                if word_text != "":
                    results.append(
                        Descendant(
                            lang=lang_name, lang_code=lang_code, word=word_text
                        )
                    )

    return results, lang_code, lang_name
def extract_zh_l_template(
    wxr: WiktextractContext, t_node: TemplateNode
) -> list[Linkage]:
    """Expand a "zh-l" (Chinese link) template into Linkage objects.

    The romanization is read from the last ``<i>`` element.  Among the
    ``<span lang="zh">`` elements, the first is tagged Traditional-Chinese
    and any later ones Simplified-Chinese.
    """
    expanded = wxr.wtp.parse(
        wxr.wtp.node_to_wikitext(t_node), expand_all=True
    )
    romanization = ""
    for italic_tag in expanded.find_html("i"):
        romanization = clean_node(wxr, None, italic_tag)
    results: list[Linkage] = []
    for idx, zh_span in enumerate(
        expanded.find_html("span", attr_name="lang", attr_value="zh")
    ):
        word_text = clean_node(wxr, None, zh_span)
        if word_text == "":
            continue
        script_tag = "Traditional-Chinese" if idx == 0 else "Simplified-Chinese"
        results.append(
            Linkage(word=word_text, tags=[script_tag], roman=romanization)
        )
    return results