Coverage for src/wiktextract/extractor/ja/linkage.py: 96%
80 statements
« prev ^ index » next coverage.py v7.6.10, created at 2024-12-27 08:07 +0000
« prev ^ index » next coverage.py v7.6.10, created at 2024-12-27 08:07 +0000
1from mediawiki_langcodes import code_to_name, name_to_code
2from wikitextprocessor.parser import LevelNode, NodeKind, TemplateNode, WikiNode
4from ...page import clean_node
5from ...wxr_context import WiktextractContext
6from ..ruby import extract_ruby
7from .models import Descendant, Linkage, WordEntry
8from .section_titles import LINKAGES
9from .tags import translate_raw_tags
def extract_linkage_section(
    wxr: WiktextractContext,
    word_entry: WordEntry,
    level_node: LevelNode,
    linkage_type: str,
) -> None:
    """Collect linkage words from a section and store them on ``word_entry``.

    "cognates" and "descendants" sections are handed over to
    ``extract_descendant_section``.  For any other section, the direct
    list and template children of ``level_node`` are walked in order:

    * a template whose name starts with ``rel-top`` sets the sense (taken
      from its first positional argument) used for the lists that follow;
    * every list item found recursively inside a list is passed to
      ``process_linkage_list_item``.

    The linkage type returned for one list item is reused as the starting
    linkage type of the next one.
    """
    if linkage_type == "cognates" or linkage_type == "descendants":
        extract_descendant_section(wxr, word_entry, level_node, linkage_type)
        return

    current_sense = ""
    for child in level_node.find_child(NodeKind.LIST | NodeKind.TEMPLATE):
        is_rel_top = isinstance(
            child, TemplateNode
        ) and child.template_name.startswith("rel-top")
        if is_rel_top:
            current_sense = clean_node(
                wxr, None, child.template_parameters.get(1, "")
            )
            continue
        if child.kind != NodeKind.LIST:
            continue
        for item in child.find_child_recursively(NodeKind.LIST_ITEM):
            # A list item may switch the linkage type; keep the updated
            # value for the items that come after it.
            linkage_type = process_linkage_list_item(
                wxr, word_entry, item, linkage_type, current_sense
            )
def process_linkage_list_item(
    wxr: WiktextractContext,
    word_entry: WordEntry,
    list_item: WikiNode,
    linkage_type: str,
    sense: str,
) -> str:
    """Extract ``Linkage`` entries from one list item.

    The children of ``list_item`` are scanned in order:

    * the first string child containing ":" marks the end of a label; the
      cleaned text before it is looked up in ``LINKAGES`` and, when found,
      replaces ``linkage_type``;
    * templates whose name starts with "おくりがな", "ふりがな" or "xlink"
      are expanded and split with ``extract_ruby``; the ruby data is
      discarded for templates named exactly "xlink";
    * link nodes contribute their cleaned text as a word;
    * a "sense" template replaces the sense used for later words.

    Words are appended to the ``word_entry`` attribute named by the
    current linkage type.  Returns the (possibly updated) linkage type.
    """
    ruby_template_prefixes = ("おくりがな", "ふりがな", "xlink")
    seen_colon = False

    for idx, child in enumerate(list_item.children):
        if isinstance(child, str):
            # Only the first colon in the item introduces a label.
            if not seen_colon and ":" in child:
                label = clean_node(wxr, None, list_item.children[:idx])
                linkage_type = LINKAGES.get(label, linkage_type)
                seen_colon = True
            continue

        if isinstance(child, TemplateNode) and child.template_name.startswith(
            ruby_template_prefixes
        ):
            expanded = wxr.wtp.parse(
                wxr.wtp.node_to_wikitext(child), expand_all=True
            )
            ruby, plain_nodes = extract_ruby(wxr, expanded.children)
            if child.template_name == "xlink":
                ruby.clear()
            word = clean_node(wxr, None, plain_nodes)
            if word:
                target = getattr(word_entry, linkage_type)
                target.append(Linkage(word=word, ruby=ruby, sense=sense))
            continue

        if isinstance(child, WikiNode) and child.kind == NodeKind.LINK:
            word = clean_node(wxr, None, child)
            if word:
                target = getattr(word_entry, linkage_type)
                target.append(Linkage(word=word, sense=sense))
            continue

        if isinstance(child, TemplateNode) and child.template_name == "sense":
            # Drop the surrounding parentheses, colon and spaces.
            sense = clean_node(wxr, None, child).strip("(): ")

    return linkage_type
def extract_descendant_section(
    wxr: WiktextractContext,
    word_entry: WordEntry,
    level_node: LevelNode,
    linkage_type: str,
) -> None:
    """Gather descendant data from the section's top-level lists.

    Every list item directly under each list child of ``level_node`` is
    processed with ``process_desc_list_item`` (with no parents), and all
    returned items are appended to the ``word_entry`` attribute named by
    ``linkage_type``.
    """
    collected = [
        desc
        for list_node in level_node.find_child(NodeKind.LIST)
        for item in list_node.find_child(NodeKind.LIST_ITEM)
        for desc in process_desc_list_item(wxr, item, [])
    ]
    target = getattr(word_entry, linkage_type)
    target.extend(collected)
def process_desc_list_item(
    wxr: WiktextractContext, list_item: WikiNode, parent_list: list[Descendant]
) -> list[Descendant]:
    """Build ``Descendant`` objects from one list item and its sub-lists.

    A string child containing ":" ends a language-name label: the cleaned
    text before it becomes the language name and is converted to a code
    with ``name_to_code``.  Each ``l`` template then yields one
    ``Descendant``; if no language code/name is known yet, they are taken
    from the template's first argument and ``code_to_name``.  Nested lists
    are processed recursively with the descendants collected so far from
    this item passed as their parents.

    Before returning, the collected descendants are appended to the
    ``descendants`` of every entry in ``parent_list``.  Returns the list
    of descendants created directly from this item.
    """
    # Argument of the `l` template -> Descendant field, applied in order
    # so a later non-empty argument overrides an earlier one.
    # https://ja.wiktionary.org/wiki/テンプレート:l
    arg_to_field = (
        (2, "word"),
        (3, "word"),
        (4, "sense"),
        ("gloss", "sense"),
        ("t", "sense"),
        ("tr", "roman"),
    )

    found = []
    lang_name = ""
    lang_code = ""

    for pos, child in enumerate(list_item.children):
        if isinstance(child, str):
            if ":" in child:
                lang_name = clean_node(wxr, None, list_item.children[:pos])
                lang_code = name_to_code(lang_name, "ja")
            continue

        if isinstance(child, TemplateNode) and child.template_name == "l":
            params = child.template_parameters
            if lang_code == "":
                lang_code = clean_node(wxr, None, params.get(1, ""))
            if lang_name == "":
                lang_name = code_to_name(lang_code, "ja")

            desc = Descendant(lang=lang_name, lang_code=lang_code)
            for arg_key, field_name in arg_to_field:
                value = clean_node(wxr, None, params.get(arg_key, ""))
                if value != "":
                    setattr(desc, field_name, value)

            # Gender markers only exist in the expanded template output.
            expanded = wxr.wtp.parse(
                wxr.wtp.node_to_wikitext(child), expand_all=True
            )
            gender_spans = expanded.find_html(
                "span", attr_name="class", attr_value="gender"
            )
            for span in gender_spans:
                tag_text = clean_node(wxr, None, span)
                if tag_text != "":
                    desc.raw_tags.append(tag_text)

            if desc.word != "":
                translate_raw_tags(desc)
                found.append(desc)
            continue

        if isinstance(child, WikiNode) and child.kind == NodeKind.LIST:
            for nested_item in child.find_child(NodeKind.LIST_ITEM):
                process_desc_list_item(wxr, nested_item, found)

    for parent in parent_list:
        parent.descendants.extend(found)
    return found