Coverage for src/wiktextract/extractor/ja/linkage.py: 85%
146 statements
« prev ^ index » next coverage.py v7.9.0, created at 2025-06-13 07:43 +0000
1from mediawiki_langcodes import name_to_code
2from wikitextprocessor import (
3 HTMLNode,
4 LevelNode,
5 NodeKind,
6 TemplateNode,
7 WikiNode,
8)
10from ...page import clean_node
11from ...wxr_context import WiktextractContext
12from ..ruby import extract_ruby
13from .models import Descendant, Form, Linkage, WordEntry
14from .section_titles import LINKAGES
15from .tags import translate_raw_tags
def extract_linkage_section(
    wxr: WiktextractContext,
    word_entry: WordEntry,
    level_node: LevelNode,
    linkage_type: str,
) -> None:
    """Parse a linkage section (synonyms, antonyms, ...) into *word_entry*.

    Cognate and descendant sections use a different list layout and are
    delegated to ``extract_descendant_section``.
    """
    if linkage_type in ("cognates", "descendants"):
        extract_descendant_section(wxr, word_entry, level_node, linkage_type)
        return

    current_sense = ""
    for child in level_node.find_child(NodeKind.LIST | NodeKind.TEMPLATE):
        if isinstance(child, TemplateNode) and child.template_name.startswith(
            "rel-top"
        ):
            # A "rel-top…" template's first argument is the sense gloss
            # that applies to the list items following it.
            current_sense = clean_node(
                wxr, None, child.template_parameters.get(1, "")
            )
        elif child.kind == NodeKind.LIST:
            for item in child.find_child_recursively(NodeKind.LIST_ITEM):
                # A list item may rename the linkage type for the items
                # that follow it, so thread the value through the loop.
                linkage_type = process_linkage_list_item(
                    wxr, word_entry, item, linkage_type, current_sense
                )
def process_linkage_list_item(
    wxr: WiktextractContext,
    word_entry: WordEntry,
    list_item: WikiNode,
    linkage_type: str,
    sense: str,
) -> str:
    """Parse one linkage list item and append results to *word_entry*.

    A list item may begin with a "label:" prefix that changes the linkage
    type (looked up in ``LINKAGES``); the possibly-updated type is returned
    so the caller can carry it over to the following list items.
    """
    # Becomes True once the "label:" prefix (if any) has been consumed.
    after_colon = False
    for node_idx, node in enumerate(list_item.children):
        if isinstance(node, str) and ":" in node and not after_colon:
            # Text before the first colon may name a linkage type.
            linkage_type_text = clean_node(
                wxr, None, list_item.children[:node_idx]
            )
            linkage_type = LINKAGES.get(linkage_type_text, linkage_type)
            after_colon = True
        elif isinstance(node, TemplateNode) and node.template_name.startswith(
            ("おくりがな", "ふりがな", "xlink")
        ):
            # Ruby-annotation templates: expand, then separate the ruby
            # (furigana) text from the base word.
            expanded_node = wxr.wtp.parse(
                wxr.wtp.node_to_wikitext(node), expand_all=True
            )
            ruby, no_ruby = extract_ruby(wxr, expanded_node.children)
            if node.template_name == "xlink":
                # Discard any extracted ruby for "xlink" templates.
                ruby.clear()
            word = clean_node(wxr, None, no_ruby)
            if len(word) > 0:
                getattr(word_entry, linkage_type).append(
                    Linkage(word=word, ruby=ruby, sense=sense)
                )
        elif isinstance(node, TemplateNode) and node.template_name == "l":
            l_data = extract_l_template(wxr, node)
            if l_data.word != "":
                getattr(word_entry, linkage_type).append(l_data)
        elif isinstance(node, TemplateNode) and node.template_name == "zh-l":
            getattr(word_entry, linkage_type).extend(
                extract_zh_l_template(wxr, node)
            )
        elif isinstance(node, WikiNode) and node.kind == NodeKind.LINK:
            word = clean_node(wxr, None, node)
            if len(word) > 0:
                getattr(word_entry, linkage_type).append(
                    Linkage(word=word, sense=sense)
                )
        elif isinstance(node, TemplateNode) and node.template_name == "sense":
            # {{sense}} updates the sense gloss for subsequent words.
            sense = clean_node(wxr, None, node).strip("(): ")

    return linkage_type
def extract_descendant_section(
    wxr: WiktextractContext,
    word_entry: WordEntry,
    level_node: LevelNode,
    linkage_type: str,
) -> None:
    """Collect descendant (or cognate) entries from the section's lists."""
    collected: list[Descendant] = []
    for wiki_list in level_node.find_child(NodeKind.LIST):
        for item in wiki_list.find_child(NodeKind.LIST_ITEM):
            # Top-level items have no parent descendant; pass an empty list.
            collected.extend(process_desc_list_item(wxr, item, []))
    # "cognates" and "descendants" both use Descendant model fields.
    getattr(word_entry, linkage_type).extend(collected)
def process_desc_list_item(
    wxr: WiktextractContext, list_item: WikiNode, parent_list: list[Descendant]
) -> list[Descendant]:
    """Parse one descendant list item, recursing into nested lists.

    Every Descendant found here is also appended to the ``descendants``
    field of each entry in *parent_list*, building the descendant tree.
    Returns the list of Descendant objects created for this item.
    """
    desc_list = []
    lang_name = "unknown"
    lang_code = "unknown"
    for index, child in enumerate(list_item.children):
        if isinstance(child, str) and ":" in child and lang_name == "unknown":
            # Plain "Language name:" prefix before the word links/templates.
            lang_name = clean_node(wxr, None, list_item.children[:index])
            lang_code = name_to_code(lang_name, "ja")
        elif isinstance(child, TemplateNode) and child.template_name == "etyl":
            # {{etyl}} supplies the display name; its first argument is the
            # language code.
            lang_name = clean_node(wxr, None, child)
            lang_code = clean_node(
                wxr, None, child.template_parameters.get(1, "")
            )
        elif isinstance(child, TemplateNode) and child.template_name == "l":
            l_data = extract_l_template(wxr, child)
            if l_data.word != "":
                desc_list.append(
                    Descendant(
                        word=l_data.word,
                        lang=lang_name,
                        # Fall back to the template's own language argument
                        # when no code was found earlier in the item.
                        lang_code=lang_code
                        or clean_node(
                            wxr, None, child.template_parameters.get(1, "")
                        ),
                        tags=l_data.tags,
                        raw_tags=l_data.raw_tags,
                        roman=l_data.roman,
                        sense=l_data.sense,
                    )
                )
        elif isinstance(child, TemplateNode) and child.template_name == "desc":
            # {{desc}} yields words plus the language for later children.
            new_descs, lang_code, lang_name = extract_desc_template(wxr, child)
            desc_list.extend(new_descs)
        elif isinstance(child, TemplateNode) and child.template_name == "zh-l":
            for l_data in extract_zh_l_template(wxr, child):
                if l_data.word != "":
                    desc_list.append(
                        Descendant(
                            word=l_data.word,
                            lang=lang_name,
                            lang_code=lang_code,
                            tags=l_data.tags,
                            roman=l_data.roman,
                        )
                    )
        elif isinstance(child, WikiNode) and child.kind == NodeKind.LIST:
            # Nested list: its items are descendants of the words found here.
            for next_list_item in child.find_child(NodeKind.LIST_ITEM):
                process_desc_list_item(wxr, next_list_item, desc_list)

    for p_data in parent_list:
        p_data.descendants.extend(desc_list)
    return desc_list
# Category: grammar templates (カテゴリ:文法テンプレート)
# Maps linkage template names to the matching WordEntry list-field name.
LINKAGE_TEMPLATES = dict(
    syn="synonyms",
    ant="antonyms",
    hyper="hypernyms",
    hypo="hyponyms",
    hyponyms="hyponyms",
    mero="meronyms",
    cot="coordinate_terms",
)
def extract_gloss_list_linkage_template(
    wxr: WiktextractContext, word_entry: WordEntry, t_node: TemplateNode
) -> None:
    """Handle an inline linkage template ({{syn}}, {{ant}}, ...) in a gloss list.

    Linked words are appended to the WordEntry field mapped from the
    template name, associated with the gloss of the most recent sense.
    """
    expanded = wxr.wtp.parse(
        wxr.wtp.node_to_wikitext(t_node), expand_all=True
    )
    lang_code = clean_node(wxr, None, t_node.template_parameters.get(1, ""))
    # Use the last parsed sense's gloss text as the linkage sense, if any.
    sense = ""
    if len(word_entry.senses) > 0 and len(word_entry.senses[-1].glosses) > 0:
        sense = " ".join(word_entry.senses[-1].glosses)
    field = LINKAGE_TEMPLATES[t_node.template_name]
    for span_tag in expanded.find_html(
        "span", attr_name="lang", attr_value=lang_code
    ):
        word = clean_node(wxr, None, span_tag)
        if word != "":
            getattr(word_entry, field).append(Linkage(word=word, sense=sense))
def extract_l_template(
    wxr: WiktextractContext, t_node: TemplateNode
) -> Linkage:
    """Parse an {{l}} link template into a Linkage.

    https://ja.wiktionary.org/wiki/テンプレート:l
    """
    expanded = wxr.wtp.parse(
        wxr.wtp.node_to_wikitext(t_node), expand_all=True
    )
    lang_code = clean_node(wxr, None, t_node.template_parameters.get(1, ""))
    result = Linkage(word="")
    for span_tag in expanded.find_html("span"):
        lang_attr = span_tag.attrs.get("lang", "")
        if lang_attr == lang_code:
            result.word = clean_node(wxr, None, span_tag)
        elif lang_attr == f"{lang_code}-Latn":
            # "<code>-Latn" spans hold the romanization.
            result.roman = clean_node(wxr, None, span_tag)
        elif span_tag.attrs.get("class", "") == "gender":
            gender_text = clean_node(wxr, None, span_tag)
            if gender_text != "":
                result.raw_tags.append(gender_text)

    if "lit" in t_node.template_parameters:
        result.literal_meaning = clean_node(
            wxr, None, t_node.template_parameters["lit"]
        )
    # The gloss may be given positionally (4) or as "gloss"/"t"; when
    # several are present the last one checked wins.
    for arg_name in (4, "gloss", "t"):
        if arg_name in t_node.template_parameters:
            result.sense = clean_node(
                wxr, None, t_node.template_parameters[arg_name]
            )
    translate_raw_tags(result)
    return result
def extract_alt_form_section(
    wxr: WiktextractContext, word_entry: WordEntry, level_node: LevelNode
) -> None:
    """Collect alternative forms from links and {{l}} templates."""
    for child in level_node.find_child_recursively(
        NodeKind.LINK | NodeKind.TEMPLATE
    ):
        if child.kind == NodeKind.LINK:
            form_text = clean_node(wxr, None, child)
            if form_text != "":
                word_entry.forms.append(
                    Form(form=form_text, tags=["alternative"])
                )
        elif isinstance(child, TemplateNode) and child.template_name == "l":
            l_data = extract_l_template(wxr, child)
            if l_data.word == "":
                continue
            word_entry.forms.append(
                Form(
                    form=l_data.word,
                    tags=l_data.tags,
                    raw_tags=l_data.raw_tags,
                    roman=l_data.roman,
                    literal_meaning=l_data.literal_meaning,
                )
            )
def extract_desc_template(
    wxr: WiktextractContext, t_node: TemplateNode
) -> tuple[list[Descendant], str, str]:
    """Parse a {{desc}} template.

    Returns the extracted Descendant objects along with the language code
    (template argument 1) and language name so the caller can reuse them
    for sibling nodes in the same list item.
    """
    results: list[Descendant] = []
    expanded = wxr.wtp.parse(
        wxr.wtp.node_to_wikitext(t_node), expand_all=True
    )
    lang_code = clean_node(wxr, None, t_node.template_parameters.get(1, ""))
    lang_name = "unknown"
    for node in expanded.children:
        if isinstance(node, str) and node.strip().endswith(":"):
            # Leading "Language name:" text produced by the template.
            lang_name = node.strip(": ")
            continue
        is_word_span = (
            isinstance(node, HTMLNode)
            and node.tag == "span"
            and lang_code == node.attrs.get("lang", "")
        )
        if not is_word_span:
            continue
        for link_node in node.find_child(NodeKind.LINK):
            word = clean_node(wxr, None, link_node)
            if word != "":
                results.append(
                    Descendant(
                        lang=lang_name, lang_code=lang_code, word=word
                    )
                )

    return results, lang_code, lang_name
def extract_zh_l_template(
    wxr: WiktextractContext, t_node: TemplateNode
) -> list[Linkage]:
    """Parse a {{zh-l}} Chinese link template into Linkage objects."""
    expanded = wxr.wtp.parse(
        wxr.wtp.node_to_wikitext(t_node), expand_all=True
    )
    # Romanization is rendered inside an <i> element; keep the last one.
    roman = ""
    for italic in expanded.find_html("i"):
        roman = clean_node(wxr, None, italic)
    linkages: list[Linkage] = []
    for idx, span_tag in enumerate(
        expanded.find_html("span", attr_name="lang", attr_value="zh")
    ):
        word = clean_node(wxr, None, span_tag)
        if word == "":
            continue
        # First matching span is tagged traditional, subsequent ones
        # simplified.
        script_tag = (
            "Traditional Chinese" if idx == 0 else "Simplified Chinese"
        )
        linkages.append(Linkage(word=word, tags=[script_tag], roman=roman))
    return linkages