Coverage for src/wiktextract/extractor/ko/linkage.py: 61%
122 statements
« prev ^ index » next coverage.py v7.11.0, created at 2025-10-24 07:36 +0000
« prev ^ index » next coverage.py v7.11.0, created at 2025-10-24 07:36 +0000
1import re
3from wikitextprocessor import LevelNode, NodeKind, TemplateNode, WikiNode
5from ...page import clean_node
6from ...wxr_context import WiktextractContext
7from .models import Linkage, WordEntry
8from .section_titles import LINKAGE_SECTIONS
9from .tags import translate_raw_tags
# Names of the box-style linkage templates on ko.wiktionary
# (틀:파생어 상자 and 틀:합성어 상자).
LINKAGE_TEMPLATES = frozenset({"파생어 상자", "합성어 상자"})
def extract_linkage_template(
    wxr: WiktextractContext,
    word_entry: WordEntry,
    node: TemplateNode,
    l_type: str,
) -> bool:
    """Extract linkage words from a box-style or ``colN`` linkage template.

    For a template named in ``LINKAGE_TEMPLATES`` the positional arguments
    1 to 40 are read in order, stopping at the first missing one; every
    argument whose cleaned text is non-empty is appended as a ``Linkage``
    to the ``l_type`` list of ``word_entry``.  A template whose name is
    ``col`` followed by a single digit is handed to
    ``extract_col_template``.

    Args:
        wxr: Extraction context passed through to ``clean_node``.
        word_entry: Entry whose ``l_type`` attribute receives the words.
        node: The template node to inspect.
        l_type: Name of the list attribute on ``word_entry`` to append to.

    Returns:
        True if at least one word was appended from a box template.  The
        ``colN`` branch does not affect the return value.
    """
    # https://ko.wiktionary.org/wiki/틀:파생어_상자
    # https://ko.wiktionary.org/wiki/틀:합성어_상자
    added_data = False
    if node.template_name in LINKAGE_TEMPLATES:
        # All words of one template share the most recent gloss.  Both
        # lists are checked so a sense without glosses yields "" instead
        # of raising IndexError.
        sense = ""
        if word_entry.senses and word_entry.senses[-1].glosses:
            sense = word_entry.senses[-1].glosses[-1]
        for key in range(1, 41):
            if key not in node.template_parameters:
                break
            word = clean_node(wxr, None, node.template_parameters[key])
            if word != "":
                getattr(word_entry, l_type).append(
                    Linkage(word=word, sense=sense)
                )
                added_data = True
    elif re.fullmatch(r"col\d", node.template_name):
        extract_col_template(wxr, word_entry, node, l_type)

    return added_data
def extract_linkage_section(
    wxr: WiktextractContext,
    word_entry: WordEntry,
    level_node: LevelNode,
    linkage_type: str,
) -> None:
    """Collect the linkages of one section into ``word_entry``.

    A ``"proverbs"`` section is handled entirely by
    ``extract_proverb_section``.  For any other type, every list item
    found recursively under ``level_node`` is processed as an in-section
    linkage item, and each direct child template is passed to
    ``extract_linkage_template``; ``외국어`` templates are additionally
    passed to ``extract_translation_template``.
    """
    if linkage_type == "proverbs":
        extract_proverb_section(wxr, word_entry, level_node)
        return

    # Imported here rather than at module level, as in the original
    # (presumably to avoid a circular import - confirm before moving).
    from .translation import extract_translation_template

    for item_node in level_node.find_child_recursively(NodeKind.LIST_ITEM):
        extract_linkage_list_item(
            wxr, word_entry, item_node, linkage_type, True
        )

    for template in level_node.find_child(NodeKind.TEMPLATE):
        extract_linkage_template(wxr, word_entry, template, linkage_type)
        if template.template_name == "외국어":
            extract_translation_template(wxr, word_entry, template)
def extract_linkage_list_item(
    wxr: WiktextractContext,
    word_entry: WordEntry,
    list_item: WikiNode,
    linkage_type: str,
    in_linkage_section: bool,
) -> None:
    """Extract linkage words from a single list item.

    Plain-text children are scanned first: text containing ``:`` may
    override ``linkage_type`` when the part before the colon is a key of
    ``LINKAGE_SECTIONS``; other text may supply a parenthesised tag.  One
    ``Linkage`` is then appended per link child; when the item contains no
    link at all, the cleaned text of the whole item is used as the word.

    Args:
        wxr: Extraction context passed through to ``clean_node``.
        word_entry: Entry that receives the linkages.
        list_item: The list item node to read.
        linkage_type: Default name of the target list on ``word_entry``.
        in_linkage_section: When true, linkages are not tied to the
            latest gloss (their ``sense`` stays empty).
    """
    raw_tag = ""
    is_roman = False
    for child in list_item.children:
        if not isinstance(child, str):
            continue
        if ":" in child:
            l_type_str = child.partition(":")[0].strip()
            if l_type_str in LINKAGE_SECTIONS:
                linkage_type = LINKAGE_SECTIONS[l_type_str]
        else:
            m = re.search(r"\(([^()]+)\)", child)
            if m is not None:
                raw_tag = m.group(1).strip()
                # A tag containing lowercase ASCII letters is stored as
                # the romanization.
                is_roman = re.search(r"[a-z]", raw_tag) is not None

    # Shared by every linkage created below.  Both lists are checked so a
    # sense without glosses yields "" instead of raising IndexError.
    sense = ""
    if (
        not in_linkage_section
        and word_entry.senses
        and word_entry.senses[-1].glosses
    ):
        sense = word_entry.senses[-1].glosses[-1]

    for link_node in list_item.find_child(NodeKind.LINK):
        word = clean_node(wxr, None, link_node)
        if word != "":
            linkage = Linkage(word=word, sense=sense)
            if len(raw_tag) > 0:
                if is_roman:
                    linkage.roman = raw_tag
                elif re.fullmatch(r"\d+", raw_tag) is not None:
                    # A purely numeric tag is recorded as the sense index.
                    linkage.sense_index = raw_tag
                else:
                    linkage.raw_tags.append(raw_tag)
                    translate_raw_tags(linkage)
            getattr(word_entry, linkage_type).append(linkage)

    if not list_item.contain_node(NodeKind.LINK):
        # No link anywhere in the item: use its whole text as the word.
        word = clean_node(wxr, None, list_item.children)
        if word != "":
            linkage = Linkage(word=word, sense=sense)
            translate_raw_tags(linkage)
            getattr(word_entry, linkage_type).append(linkage)
def extract_proverb_section(
    wxr: WiktextractContext,
    word_entry: WordEntry,
    level_node: LevelNode,
) -> None:
    """Collect proverbs from every list item under ``level_node``.

    The first plain-text child containing ``:`` splits an item: everything
    before the colon becomes the proverb and everything after it becomes
    the sense.  Items that yield no proverb this way fall back to their
    ``l`` / ``연결`` templates, handled by ``extract_l_template``.
    """
    for item in level_node.find_child_recursively(NodeKind.LIST_ITEM):
        proverb = Linkage(word="")
        children = item.children
        for pos, node in enumerate(children):
            if not isinstance(node, str) or ":" not in node:
                continue
            before, _, after = node.partition(":")
            proverb.word = (
                clean_node(wxr, None, children[:pos]) + before.strip()
            )
            proverb.sense = after.strip() + clean_node(
                wxr, None, children[pos + 1 :]
            )
            break

        if proverb.word == "":
            for template in item.find_child(NodeKind.TEMPLATE):
                if template.template_name in ["l", "연결"]:
                    extract_l_template(wxr, word_entry, template, "proverbs")
        else:
            word_entry.proverbs.append(proverb)
def extract_l_template(
    wxr: WiktextractContext,
    word_entry: WordEntry,
    t_node: TemplateNode,
    linkage_type: str,
) -> None:
    """Append the word of an ``l`` / ``연결`` link template as a linkage.

    The word is taken from positional argument 3 if it is present and
    non-empty, otherwise from argument 2.  The sense comes from the first
    of the ``t`` and ``4`` arguments that is present.  Nothing is appended
    when neither word argument yields text.

    Args:
        wxr: Extraction context passed through to ``clean_node``.
        word_entry: Entry that receives the linkage.
        t_node: The link template node.
        linkage_type: Name of the target list attribute on ``word_entry``.
    """
    # https://ko.wiktionary.org/wiki/틀:연결
    # https://en.wiktionary.org/wiki/Template:link
    for word_arg in [3, 2]:
        if word_arg not in t_node.template_parameters:
            continue
        word = clean_node(wxr, None, t_node.template_parameters[word_arg])
        if word == "":
            # An empty argument 3 must not hide the word in argument 2,
            # so keep looking instead of giving up.
            continue
        linkage = Linkage(word=word)
        for sense_arg in ["t", 4]:
            if sense_arg in t_node.template_parameters:
                linkage.sense = clean_node(
                    wxr, None, t_node.template_parameters[sense_arg]
                )
                break
        getattr(word_entry, linkage_type).append(linkage)
        break
def extract_col_template(
    wxr: WiktextractContext,
    word_entry: WordEntry,
    t_node: TemplateNode,
    l_type: str,
):
    """Extract linkages from the expanded HTML of a ``colN`` template.

    The template is expanded and re-parsed, then every ``li`` element is
    read: a span whose ``lang`` ends in ``-Latn`` gives the romanization,
    a span whose class contains ``qualifier-content`` gives
    comma-separated raw tags, and any other span with a ``lang`` gives a
    word.  All words found in one ``li`` share that item's romanization
    and raw tags.  The result is added to the ``l_type`` list of
    ``word_entry`` in one step at the end.
    """
    script_tags = {
        "Hant": "Traditional-Chinese",
        "Hans": "Simplified-Chinese",
    }
    wikitext = wxr.wtp.node_to_wikitext(t_node)
    root = wxr.wtp.parse(wikitext, expand_all=True)

    collected = []
    for li_node in root.find_html_recursively("li"):
        item_linkages = []
        romanization = ""
        qualifiers = []
        for span in li_node.find_html("span"):
            lang_code = span.attrs.get("lang", "")
            if lang_code.endswith("-Latn"):
                romanization = clean_node(wxr, None, span)
                continue
            if "qualifier-content" in span.attrs.get("class", ""):
                pieces = (
                    piece.strip()
                    for piece in clean_node(wxr, None, span).split(",")
                )
                qualifiers.extend(piece for piece in pieces if piece != "")
                continue
            if lang_code == "":
                continue

            entry = Linkage(word=clean_node(wxr, None, span))
            # Only an exact class value of "Hant" or "Hans" adds a tag.
            script_tag = script_tags.get(span.attrs.get("class", ""))
            if script_tag is not None:
                entry.tags.append(script_tag)
            if entry.word != "":
                item_linkages.append(entry)

        # Romanization and qualifiers may appear after the word spans, so
        # they are applied only once the whole item has been read.
        for entry in item_linkages:
            entry.raw_tags.extend(qualifiers)
            entry.roman = romanization
            translate_raw_tags(entry)
        collected.extend(item_linkages)

    getattr(word_entry, l_type).extend(collected)