Coverage for src / wiktextract / extractor / zh / headword_line.py: 92%
147 statements
« prev ^ index » next coverage.py v7.13.1, created at 2025-12-29 01:50 +0000
1import re
3from wikitextprocessor import HTMLNode, NodeKind, TemplateNode, WikiNode
5from ...page import clean_node
6from ...wxr_context import WiktextractContext
7from ..ruby import extract_ruby
8from ..share import strip_nodes
9from .models import Classifier, Form, WordEntry
10from .tags import TEMPLATE_TAG_ARGS, translate_raw_tags
def extract_pos_head_line_nodes(
    wxr: WiktextractContext, word_entry: WordEntry, nodes: list[WikiNode | str]
) -> None:
    """Process the nodes of a part-of-speech headword line.

    Templates are dispatched to the "tlb"/"term-label" handler or to the
    generic headword-line handler; only the first bold node is treated as
    a headword. Afterwards, forms carrying the raw tag "分類詞"
    (classifier) are moved from ``word_entry.forms`` into
    ``word_entry.classifiers``.
    """
    seen_bold = False
    for child in nodes:
        if isinstance(child, TemplateNode):
            if child.template_name in ("tlb", "term-label"):
                extract_tlb_template(wxr, word_entry, child)
            else:
                extract_headword_line_template(wxr, word_entry, child)
            continue
        is_bold = isinstance(child, WikiNode) and child.kind == NodeKind.BOLD
        if is_bold and not seen_bold:
            process_headword_bold_node(wxr, word_entry, child)
            seen_bold = True

    kept_forms = []
    for form in word_entry.forms:
        if "分類詞" not in form.raw_tags:
            kept_forms.append(form)
            continue
        word_entry.classifiers.append(
            Classifier(
                classifier=form.form, tags=form.tags, raw_tags=form.raw_tags
            )
        )
    word_entry.forms = kept_forms
    translate_raw_tags(word_entry)
def extract_headword_line_template(
    wxr: WiktextractContext, word_entry: WordEntry, t_node: TemplateNode
) -> None:
    """Expand a headword-line template and extract forms/tags from its HTML.

    Only "head" and language-prefixed templates (e.g. "ja-noun") are
    handled; "*-see" redirect templates are skipped. Children inside the
    "headword-line" span are classified by tag/class; anything after the
    last recognized child (tracked by ``forms_start_index``) is passed to
    ``extract_headword_forms``, as are any siblings outside that span.
    """
    # handle the first template in header line
    template_name = t_node.template_name
    if (
        template_name != "head"
        and not template_name.startswith(f"{word_entry.lang_code}-")
    ) or template_name.endswith("-see"):
        return

    expanded_node = wxr.wtp.parse(
        wxr.wtp.node_to_wikitext(t_node), expand_all=True
    )
    # collect categories/links produced by the expansion into word_entry
    clean_node(wxr, word_entry, expanded_node)
    forms_start_index = 0
    nodes_after_span = []
    for node in expanded_node.children:
        if not (
            isinstance(node, HTMLNode)
            and node.tag == "span"
            and "headword-line" in node.attrs.get("class", "").split()
        ):
            # siblings outside the headword-line span are processed later
            nodes_after_span.append(node)
            continue
        for index, span_child in node.find_child(NodeKind.HTML, True):
            if span_child.tag == "span":
                forms_start_index = index + 1
                class_names = span_child.attrs.get("class", "").split()
                if "headword-tr" in class_names:
                    # romanization of the headword
                    form = clean_node(wxr, word_entry, span_child)
                    if form != "":
                        word_entry.forms.append(
                            Form(form=form, tags=["romanization"])
                        )
                elif "gender" in class_names:
                    # gender abbreviations, e.g. "m", "f"
                    for abbr_tag in span_child.find_html("abbr"):
                        gender = clean_node(wxr, None, abbr_tag)
                        if gender in TEMPLATE_TAG_ARGS:
                            word_entry.tags.append(TEMPLATE_TAG_ARGS[gender])
                        else:
                            word_entry.raw_tags.append(gender)
                elif "ib-content" in class_names:
                    # in-brackets label text, kept as a raw tag
                    raw_tag = clean_node(wxr, None, span_child)
                    if raw_tag != "":
                        word_entry.raw_tags.append(raw_tag)
                else:
                    # nested bold headword inside an unclassified span
                    for strong_node in span_child.find_html(
                        "strong", attr_name="class", attr_value="headword"
                    ):
                        process_headword_bold_node(wxr, word_entry, strong_node)
            elif (
                span_child.tag == "strong"
                and "headword" in span_child.attrs.get("class", "")
            ):
                forms_start_index = index + 1
                process_headword_bold_node(wxr, word_entry, span_child)
            elif span_child.tag == "sup" and word_entry.lang_code == "ja":
                # Japanese historical kana spelling, see Template:ja-adj
                extract_historical_kana(wxr, word_entry, span_child)
                forms_start_index = index + 1
            elif span_child.tag == "i":
                # italic labels; "^†" marks are stripped
                for i_child in span_child.children:
                    raw_tag = (
                        clean_node(wxr, None, i_child)
                        .removeprefix("^†")
                        .strip()
                    )
                    if raw_tag != "":
                        word_entry.raw_tags.append(raw_tag)
                if len(span_child.children) > 0:
                    forms_start_index = index + 1
            elif span_child.tag == "b":
                # this is a form <b> tag, already inside form parentheses
                break

        extract_headword_forms(
            wxr, word_entry, node.children[forms_start_index:]
        )
    if len(nodes_after_span) > 0:
        extract_headword_forms(wxr, word_entry, nodes_after_span)
def process_headword_bold_node(
    wxr: WiktextractContext, word_entry: WordEntry, strong_node: HTMLNode
) -> None:
    """Extract the bold headword text (with ruby) from ``strong_node``.

    If the cleaned form differs from the page word (or carries ruby), it
    is stored as a "canonical" form — except on "unsupported title"
    pages, where it replaces ``word_entry.word`` itself.
    """
    ruby_data, node_without_ruby = extract_ruby(wxr, strong_node)
    form = clean_node(wxr, word_entry, node_without_ruby)
    if (len(ruby_data) > 0 or form != word_entry.word) and len(form) > 0:
        if wxr.wtp.title.startswith("不支援的頁面名稱/"):
            # Unsupported titles:
            # https://zh.wiktionary.org/wiki/Appendix:不支援的頁面名稱
            # https://zh.wiktionary.org/wiki/Special:PrefixIndex/不支援的頁面名稱
            word_entry.word = form
            word_entry.original_title = wxr.wtp.title
        else:
            word_entry.forms.append(
                Form(
                    # reuse the already-cleaned text; the original cleaned
                    # the same node a second time, re-collecting categories
                    # into word_entry for no benefit
                    form=form,
                    ruby=ruby_data,
                    tags=["canonical"],
                )
            )
def extract_headword_forms(
    wxr: WiktextractContext,
    word_entry: WordEntry,
    form_nodes: list[WikiNode | str],
) -> None:
    """Split the nodes after the headword on commas and process each chunk."""
    chunk: list[WikiNode | str] = []
    for piece in form_nodes:
        # an ASCII or fullwidth comma at the start of a plain string
        # terminates the current chunk
        if isinstance(piece, str) and piece.startswith((",", ",")):
            process_forms_text(wxr, word_entry, chunk)
            chunk = [piece[1:]]
        else:
            chunk.append(piece)
    if chunk:
        process_forms_text(wxr, word_entry, chunk)
def process_forms_text(
    wxr: WiktextractContext,
    word_entry: WordEntry,
    form_nodes: list[WikiNode | str],
) -> None:
    """Extract form data from one comma-separated headword-line segment.

    Bold nodes are forms; text collected before a bold node becomes its
    raw tags. A following span with class "gender" contributes gender
    tags, and a span with class "tr" supplies the romanization of the
    previous form. If the segment contains no bold form at all, the tag
    text applies to the whole ``word_entry``.
    """
    tag_nodes = []
    has_forms = False
    striped_nodes = list(strip_nodes(form_nodes))
    for index, node in enumerate(striped_nodes):
        if (isinstance(node, HTMLNode) and node.tag == "b") or (
            isinstance(node, WikiNode) and node.kind == NodeKind.BOLD
        ):
            has_forms = True
            # removed dead "ruby_data = []" — it was immediately overwritten
            ruby_data, node_without_ruby = extract_ruby(wxr, node)
            form = clean_node(wxr, None, node_without_ruby)
            raw_form_tags = extract_headword_tags(
                clean_node(wxr, None, tag_nodes).strip("() ")
            )
            form_tags = []
            # check if next tag has gender data
            if index < len(striped_nodes) - 1:
                next_node = striped_nodes[index + 1]
                if (
                    isinstance(next_node, WikiNode)
                    and next_node.kind == NodeKind.HTML
                    and next_node.tag == "span"
                    and "gender" in next_node.attrs.get("class", "")
                ):
                    gender = clean_node(wxr, None, next_node)
                    if gender in TEMPLATE_TAG_ARGS:
                        form_tags.append(TEMPLATE_TAG_ARGS[gender])
                    else:
                        raw_form_tags.append(gender)

            # one bold tag may contain several forms separated by "/" or ","
            for f_str in filter(None, map(str.strip, re.split(r"/|,", form))):
                form_data = Form(
                    form=f_str,
                    raw_tags=raw_form_tags,
                    tags=form_tags,
                    ruby=ruby_data,
                )
                translate_raw_tags(form_data)
                word_entry.forms.append(form_data)
        elif (
            isinstance(node, HTMLNode)
            and node.tag == "span"
            and "tr" in node.attrs.get("class", "").split()
            and len(word_entry.forms) > 0
        ):
            # romanization of the previous form <b> tag
            word_entry.forms[-1].roman = clean_node(wxr, None, node)
        elif not (
            isinstance(node, HTMLNode)
            and node.tag == "span"
            and "mention-gloss-paren" in node.attrs.get("class", "").split()
        ):
            # everything else (except the parentheses spans) is tag text
            tag_nodes.append(node)

    if not has_forms:
        tags_list = extract_headword_tags(
            clean_node(wxr, word_entry, tag_nodes).strip("() ")
        )
        if len(tags_list) > 0:
            word_entry.raw_tags.extend(tags_list)
            translate_raw_tags(word_entry)
def extract_headword_tags(tags_str: str) -> list[str]:
    """Split a tag string on "&", "或" (or) and "和" (and), dropping blanks."""
    pieces = (piece.strip() for piece in re.split("&|或|和", tags_str))
    return [piece for piece in pieces if piece]
def extract_historical_kana(
    wxr: WiktextractContext, word_entry: WordEntry, sup_node: HTMLNode
) -> None:
    """Extract the historical kana spelling from a <sup> node.

    https://zh.wiktionary.org/wiki/Template:ja-adj ("hist" parameter).
    The <strong> child holds the kana form; a span with class "tr" holds
    its romanization (parentheses stripped). Saved with the "archaic" tag.
    """
    kana_form = ""
    romanization = ""
    for bold_node in sup_node.find_html("strong"):
        kana_form = clean_node(wxr, None, bold_node)
    for tr_span in sup_node.find_html(
        "span", attr_name="class", attr_value="tr"
    ):
        romanization = clean_node(wxr, None, tr_span).strip("()")
    if kana_form:
        word_entry.forms.append(
            Form(form=kana_form, roman=romanization, tags=["archaic"])
        )
def extract_tlb_template(
    wxr: WiktextractContext, word_entry: WordEntry, t_node: TemplateNode
) -> None:
    """Extract term labels as raw tags from a "tlb"/"term-label" template.

    https://zh.wiktionary.org/wiki/Template:Tlb
    https://en.wiktionary.org/wiki/Template:term-label
    """
    expanded = wxr.wtp.parse(
        wxr.wtp.node_to_wikitext(t_node), expand_all=True
    )
    for label_span in expanded.find_html_recursively(
        "span", attr_name="class", attr_value="ib-content"
    ):
        label_text = clean_node(wxr, None, label_span)
        for piece in label_text.split(","):
            piece = piece.strip()
            if piece:
                word_entry.raw_tags.append(piece)
    # collect categories/links produced by the expansion
    clean_node(wxr, word_entry, expanded)
    translate_raw_tags(word_entry)