Coverage for src/wiktextract/extractor/zh/headword_line.py: 88%
130 statements
« prev ^ index » next coverage.py v7.10.3, created at 2025-08-15 05:18 +0000
« prev ^ index » next coverage.py v7.10.3, created at 2025-08-15 05:18 +0000
1import re
3from wikitextprocessor import HTMLNode, NodeKind, TemplateNode, WikiNode
5from ...page import clean_node
6from ...wxr_context import WiktextractContext
7from ..ruby import extract_ruby
8from ..share import strip_nodes
9from .models import Form, WordEntry
10from .tags import TEMPLATE_TAG_ARGS, translate_raw_tags
def extract_pos_head_line_nodes(
    wxr: WiktextractContext, word_entry: WordEntry, nodes: list[WikiNode | str]
) -> None:
    """Dispatch the nodes of a part-of-speech head line to their extractors.

    Templates named ``tlb`` or ``term-label`` are passed to
    ``extract_tlb_template``; every other template is passed to
    ``extract_headword_line_template``. Among the remaining nodes only the
    first bold wiki node is handled, via ``process_headword_bold_node``;
    later bold nodes, other node kinds and plain strings are ignored.
    """
    bold_seen = False
    for item in nodes:
        if isinstance(item, TemplateNode):
            # Templates are checked first, before the bold-node test.
            handler = (
                extract_tlb_template
                if item.template_name in ["tlb", "term-label"]
                else extract_headword_line_template
            )
            handler(wxr, word_entry, item)
            continue
        if bold_seen or not isinstance(item, WikiNode):
            continue
        if item.kind == NodeKind.BOLD:
            process_headword_bold_node(wxr, word_entry, item)
            bold_seen = True
def extract_headword_line_template(
    wxr: WiktextractContext, word_entry: WordEntry, t_node: TemplateNode
) -> None:
    """Extract forms and tags from an expanded headword-line template.

    Only the ``head`` template and templates whose name starts with
    ``"<word_entry.lang_code>-"`` are handled; names ending in ``-see`` are
    skipped. The template is expanded and every
    ``<span class="headword-line">`` in the expansion is scanned:

    * ``headword-tr`` spans become forms tagged ``romanization``;
    * ``gender`` spans contribute tags (via ``TEMPLATE_TAG_ARGS``) or raw
      tags to ``word_entry``;
    * ``<strong class="headword">`` nodes go to
      ``process_headword_bold_node``;
    * the children after the last of those nodes are handed to
      ``extract_headword_forms``.
    """
    # handle the first template in header line
    template_name = t_node.template_name
    is_headword_template = template_name == "head" or template_name.startswith(
        f"{word_entry.lang_code}-"
    )
    if not is_headword_template or template_name.endswith("-see"):
        return

    expanded_node = wxr.wtp.parse(
        wxr.wtp.node_to_wikitext(t_node), expand_all=True
    )
    # Return value is discarded; presumably this call records page-level
    # data (e.g. categories) on word_entry -- confirm against clean_node.
    clean_node(wxr, word_entry, expanded_node)
    for span_node in expanded_node.find_html(
        "span", attr_name="class", attr_value="headword-line"
    ):
        # Reset for every headword-line span so that an index computed for
        # an earlier span is never used to slice this span's children.
        forms_start_index = 0
        for index, span_child in span_node.find_child(NodeKind.HTML, True):
            if span_child.tag == "span":
                forms_start_index = index + 1
                class_names = span_child.attrs.get("class", "")
                if "headword-tr" in class_names:
                    form = clean_node(wxr, word_entry, span_child)
                    if form != "":
                        word_entry.forms.append(
                            Form(form=form, tags=["romanization"])
                        )
                elif "gender" in class_names:
                    for abbr_tag in span_child.find_html("abbr"):
                        # Read the text with clean_node (as
                        # process_forms_text does) instead of indexing
                        # children[0]: an empty <abbr> no longer raises
                        # IndexError and a nested node never ends up in
                        # the tag lists.
                        gender = clean_node(wxr, None, abbr_tag)
                        if gender == "":
                            continue
                        if gender in TEMPLATE_TAG_ARGS:
                            word_entry.tags.append(TEMPLATE_TAG_ARGS[gender])
                        else:
                            word_entry.raw_tags.append(gender)
                            translate_raw_tags(word_entry)
                else:
                    for strong_node in span_child.find_html(
                        "strong", attr_name="class", attr_value="headword"
                    ):
                        process_headword_bold_node(wxr, word_entry, strong_node)
            elif (
                span_child.tag == "strong"
                and "headword" in span_child.attrs.get("class", "")
            ):
                forms_start_index = index + 1
                process_headword_bold_node(wxr, word_entry, span_child)
            elif span_child.tag == "b":
                # this is a form <b> tag, already inside form parentheses
                break

        extract_headword_forms(
            wxr, word_entry, span_node.children[forms_start_index:]
        )
def process_headword_bold_node(
    wxr: WiktextractContext, word_entry: WordEntry, strong_node: HTMLNode
) -> None:
    """Record the headword shown in a bold/strong node.

    Ruby annotations are split off with ``extract_ruby`` and the remaining
    node is converted to text. Nothing is recorded when that text is empty,
    or when it equals ``word_entry.word`` and there is no ruby data.
    Otherwise:

    * on pages whose title starts with ``不支援的頁面名稱/`` the text replaces
      ``word_entry.word`` and the page title is kept in
      ``word_entry.original_title``;
    * on all other pages a ``canonical`` form carrying the ruby data is
      appended to ``word_entry.forms``.
    """
    ruby_data, node_without_ruby = extract_ruby(wxr, strong_node)
    form = clean_node(wxr, word_entry, node_without_ruby)
    if len(form) == 0:
        return
    if len(ruby_data) == 0 and form == word_entry.word:
        return

    if wxr.wtp.title.startswith("不支援的頁面名稱/"):
        # Unsupported titles:
        # https://zh.wiktionary.org/wiki/Appendix:不支援的頁面名稱
        # https://zh.wiktionary.org/wiki/Special:PrefixIndex/不支援的頁面名稱
        word_entry.word = form
        word_entry.original_title = wxr.wtp.title
    else:
        # Reuse the text computed above rather than running clean_node on
        # the same node a second time.
        word_entry.forms.append(
            Form(form=form, ruby=ruby_data, tags=["canonical"])
        )
def extract_headword_forms(
    wxr: WiktextractContext,
    word_entry: WordEntry,
    form_nodes: list[WikiNode | str],
) -> None:
    """Split headword-line nodes into comma-separated groups and process each.

    A string node that starts with a comma closes the current group: the
    nodes collected so far are passed to ``process_forms_text`` and a new
    group is started with the rest of that string (the comma itself is
    dropped). Any nodes left after the loop form the final group.
    """
    # Accept both the ASCII comma and the fullwidth comma; the separator
    # tuple previously listed the same literal twice, so "，" never split
    # the form groups.
    separators = (",", "，")
    current_nodes = []
    for node in form_nodes:
        if isinstance(node, str) and node.startswith(separators):
            process_forms_text(wxr, word_entry, current_nodes)
            # Both separators are a single character, so [1:] removes it.
            current_nodes = [node[1:]]
        else:
            current_nodes.append(node)

    if len(current_nodes) > 0:
        process_forms_text(wxr, word_entry, current_nodes)
def process_forms_text(
    wxr: WiktextractContext,
    word_entry: WordEntry,
    form_nodes: list[WikiNode | str],
) -> None:
    """Turn one comma-separated group of headword-line nodes into forms/tags.

    The nodes are first passed through ``strip_nodes``. Each ``<b>`` node
    yields one ``Form`` per ``/``-separated piece of its text; the text of
    the non-form nodes collected so far supplies the raw tags, and a
    directly following ``gender`` span supplies a tag or raw tag. A span
    whose class contains ``tr`` sets ``roman`` on the most recently added
    form, and for ``ja`` entries a ``<sup>`` node is handed to
    ``extract_historical_kana``. When the group has no ``<b>`` node at all,
    its text is split into raw tags for ``word_entry`` itself.
    """
    pending_tag_nodes = []
    found_form = False
    stripped = list(strip_nodes(form_nodes))
    lang_code = word_entry.lang_code
    last_index = len(stripped) - 1
    for position, item in enumerate(stripped):
        if not (isinstance(item, WikiNode) and item.kind == NodeKind.HTML):
            pending_tag_nodes.append(item)
            continue

        if item.tag == "b":
            found_form = True
            if lang_code == "ja":
                ruby, ruby_free_node = extract_ruby(wxr, item)
                form_text = clean_node(wxr, None, ruby_free_node)
            else:
                ruby = []
                form_text = clean_node(wxr, None, item)
            tag_text = clean_node(wxr, None, pending_tag_nodes)
            raw_tags = extract_headword_tags(tag_text.strip("() "))
            tags = []
            # check if next tag has gender data
            # NOTE(review): the gender span is not skipped afterwards; on its
            # own iteration it falls through to the final else branch and is
            # added to the pending tag nodes -- confirm this is intended.
            following = stripped[position + 1] if position < last_index else None
            if (
                isinstance(following, WikiNode)
                and following.kind == NodeKind.HTML
                and following.tag == "span"
                and "gender" in following.attrs.get("class", "")
            ):
                gender = clean_node(wxr, None, following)
                if gender in TEMPLATE_TAG_ARGS:
                    tags.append(TEMPLATE_TAG_ARGS[gender])
                else:
                    raw_tags.append(gender)

            for piece in (part.strip() for part in form_text.split("/")):
                if piece == "":
                    continue
                new_form = Form(
                    form=piece,
                    raw_tags=raw_tags,
                    tags=tags,
                    ruby=ruby,
                )
                translate_raw_tags(new_form)
                word_entry.forms.append(new_form)
        elif (
            item.tag == "span"
            and "tr" in item.attrs.get("class", "")
            and len(word_entry.forms) > 0
        ):
            # romanization of the previous form <b> tag
            word_entry.forms[-1].roman = clean_node(wxr, None, item)
        elif item.tag == "sup" and lang_code == "ja":
            extract_historical_kana(wxr, word_entry, item)
        else:
            pending_tag_nodes.append(item)

    if found_form:
        return

    # No <b> form in this group: its text describes the entry itself.
    entry_tag_text = clean_node(wxr, word_entry, pending_tag_nodes)
    entry_tags = extract_headword_tags(entry_tag_text.strip("() "))
    if len(entry_tags) > 0:
        word_entry.raw_tags.extend(entry_tags)
        translate_raw_tags(word_entry)
def extract_headword_tags(tags_str: str) -> list[str]:
    """Split a tag string on ``&``, ``或`` or ``和`` into trimmed tags.

    Each piece has surrounding whitespace removed, and pieces that are
    empty after trimming are dropped.
    """
    pieces = (piece.strip() for piece in re.split("&|或|和", tags_str))
    return [piece for piece in pieces if piece]
def extract_historical_kana(
    wxr: WiktextractContext, word_entry: WordEntry, sup_node: HTMLNode
) -> None:
    """Append the form found inside a ``<sup>`` node to ``word_entry.forms``.

    The form text comes from the ``<strong>`` nodes and the romanization
    from the ``<span class="tr">`` nodes inside ``sup_node``; if several
    match, the last one of each wins. Nothing is appended when no form text
    is found.
    """
    # https://zh.wiktionary.org/wiki/Template:ja-adj
    # "hist" parameter
    kana = ""
    for strong_node in sup_node.find_html("strong"):
        kana = clean_node(wxr, None, strong_node)

    romanization = ""
    for span_node in sup_node.find_html(
        "span", attr_name="class", attr_value="tr"
    ):
        # Drop the parentheses surrounding the romanization text.
        romanization = clean_node(wxr, None, span_node).strip("()")

    if len(kana) == 0:
        return
    word_entry.forms.append(Form(form=kana, roman=romanization))
def extract_tlb_template(
    wxr: WiktextractContext, word_entry: WordEntry, t_node: TemplateNode
) -> None:
    """Collect raw tags from an expanded ``tlb``/``term-label`` template.

    The text of every ``<span class="ib-content">`` found anywhere in the
    expansion is appended to ``word_entry.raw_tags`` when non-empty, after
    which the raw tags are run through ``translate_raw_tags``.
    """
    # https://zh.wiktionary.org/wiki/Template:Tlb
    # https://en.wiktionary.org/wiki/Template:term-label
    wikitext = wxr.wtp.node_to_wikitext(t_node)
    expanded_node = wxr.wtp.parse(wikitext, expand_all=True)
    labels = (
        clean_node(wxr, None, span_tag)
        for span_tag in expanded_node.find_html_recursively(
            "span", attr_name="class", attr_value="ib-content"
        )
    )
    word_entry.raw_tags.extend(label for label in labels if len(label) > 0)
    # Return value is discarded; presumably this call records page-level
    # data (e.g. categories) on word_entry -- confirm against clean_node.
    clean_node(wxr, word_entry, expanded_node)
    translate_raw_tags(word_entry)