Coverage for src / wiktextract / extractor / zh / headword_line.py: 93%
141 statements
« prev ^ index » next coverage.py v7.12.0, created at 2025-12-05 07:46 +0000
1import re
3from wikitextprocessor import HTMLNode, NodeKind, TemplateNode, WikiNode
5from ...page import clean_node
6from ...wxr_context import WiktextractContext
7from ..ruby import extract_ruby
8from ..share import strip_nodes
9from .models import Form, WordEntry
10from .tags import TEMPLATE_TAG_ARGS, translate_raw_tags
def extract_pos_head_line_nodes(
    wxr: WiktextractContext, word_entry: WordEntry, nodes: list[WikiNode | str]
) -> None:
    """Dispatch each node of a part-of-speech header line to its extractor.

    Template nodes are routed to the "tlb"/"term-label" handler or to the
    generic headword-line handler; only the first bold node is treated as
    the headword itself. Finally the collected raw tags are translated.
    """
    seen_bold = False
    for node in nodes:
        if isinstance(node, TemplateNode):
            if node.template_name in ("tlb", "term-label"):
                extract_tlb_template(wxr, word_entry, node)
            else:
                extract_headword_line_template(wxr, word_entry, node)
        elif (
            not seen_bold
            and isinstance(node, WikiNode)
            and node.kind == NodeKind.BOLD
        ):
            process_headword_bold_node(wxr, word_entry, node)
            seen_bold = True
    translate_raw_tags(word_entry)
def extract_headword_line_template(
    wxr: WiktextractContext, word_entry: WordEntry, t_node: TemplateNode
) -> None:
    """Extract data from an expanded headword-line template.

    Only "head" and language-prefixed templates ("<lang_code>-...") are
    handled; "*-see" redirect templates are skipped. Inside the
    "headword-line" <span>, child tags yield romanizations, gender tags,
    labels and the bold headword; everything after the last recognised
    tag, plus any content outside the span, is treated as form text.
    """
    t_name = t_node.template_name
    is_headword_tmpl = t_name == "head" or t_name.startswith(
        f"{word_entry.lang_code}-"
    )
    if not is_headword_tmpl or t_name.endswith("-see"):
        return

    expanded = wxr.wtp.parse(
        wxr.wtp.node_to_wikitext(t_node), expand_all=True
    )
    clean_node(wxr, word_entry, expanded)  # collect category links
    form_start = 0
    trailing_nodes = []
    for child in expanded.children:
        is_hw_span = (
            isinstance(child, HTMLNode)
            and child.tag == "span"
            and "headword-line" in child.attrs.get("class", "").split()
        )
        if not is_hw_span:
            trailing_nodes.append(child)
            continue
        for idx, tag in child.find_child(NodeKind.HTML, True):
            if tag.tag == "span":
                form_start = idx + 1
                classes = tag.attrs.get("class", "").split()
                if "headword-tr" in classes:
                    roman = clean_node(wxr, word_entry, tag)
                    if roman != "":
                        word_entry.forms.append(
                            Form(form=roman, tags=["romanization"])
                        )
                elif "gender" in classes:
                    for abbr in tag.find_html("abbr"):
                        gender = clean_node(wxr, None, abbr)
                        if gender in TEMPLATE_TAG_ARGS:
                            word_entry.tags.append(TEMPLATE_TAG_ARGS[gender])
                        else:
                            word_entry.raw_tags.append(gender)
                elif "ib-content" in classes:
                    label = clean_node(wxr, None, tag)
                    if label != "":
                        word_entry.raw_tags.append(label)
                else:
                    for bold in tag.find_html(
                        "strong", attr_name="class", attr_value="headword"
                    ):
                        process_headword_bold_node(wxr, word_entry, bold)
            elif (
                tag.tag == "strong"
                and "headword" in tag.attrs.get("class", "")
            ):
                form_start = idx + 1
                process_headword_bold_node(wxr, word_entry, tag)
            elif tag.tag == "sup" and word_entry.lang_code == "ja":
                extract_historical_kana(wxr, word_entry, tag)
                form_start = idx + 1
            elif tag.tag == "i":
                for italic_child in tag.children:
                    label = (
                        clean_node(wxr, None, italic_child)
                        .removeprefix("^†")
                        .strip()
                    )
                    if label != "":
                        word_entry.raw_tags.append(label)
                if len(tag.children) > 0:
                    form_start = idx + 1
            elif tag.tag == "b":
                # this is a form <b> tag, already inside form parentheses
                break

        extract_headword_forms(wxr, word_entry, child.children[form_start:])
    if len(trailing_nodes) > 0:
        extract_headword_forms(wxr, word_entry, trailing_nodes)
def process_headword_bold_node(
    wxr: WiktextractContext, word_entry: WordEntry, strong_node: HTMLNode
) -> None:
    """Record the bold headword text as the page word or a canonical form.

    Pages under the "unsupported titles" prefix take their real word from
    the bold text; otherwise, if the bold text differs from the page word
    or carries ruby, it is stored as a "canonical" form.
    """
    ruby_data, plain_node = extract_ruby(wxr, strong_node)
    bold_text = clean_node(wxr, word_entry, plain_node)
    if len(bold_text) == 0:
        return
    if len(ruby_data) == 0 and bold_text == word_entry.word:
        return
    if wxr.wtp.title.startswith("不支援的頁面名稱/"):
        # Unsupported titles:
        # https://zh.wiktionary.org/wiki/Appendix:不支援的頁面名稱
        # https://zh.wiktionary.org/wiki/Special:PrefixIndex/不支援的頁面名稱
        word_entry.word = bold_text
        word_entry.original_title = wxr.wtp.title
    else:
        word_entry.forms.append(
            Form(
                form=clean_node(wxr, word_entry, plain_node),
                ruby=ruby_data,
                tags=["canonical"],
            )
        )
def extract_headword_forms(
    wxr: WiktextractContext,
    word_entry: WordEntry,
    form_nodes: list[WikiNode | str],
) -> None:
    """Split the form area of a headword line into comma-separated groups.

    Both the ASCII comma and the fullwidth comma "，" (U+FF0C, the
    separator used in Chinese-language text) start a new form group; the
    comma character itself is dropped. Each group is passed on to
    ``process_forms_text``; the trailing group is flushed at the end.
    """
    current_nodes: list[WikiNode | str] = []
    for node in form_nodes:
        # BUG FIX: the tuple previously contained the ASCII comma twice,
        # so groups separated by the fullwidth comma were never split.
        if isinstance(node, str) and node.startswith((",", "，")):
            process_forms_text(wxr, word_entry, current_nodes)
            current_nodes = [node[1:]]
        else:
            current_nodes.append(node)
    if len(current_nodes) > 0:
        process_forms_text(wxr, word_entry, current_nodes)
def process_forms_text(
    wxr: WiktextractContext,
    word_entry: WordEntry,
    form_nodes: list[WikiNode | str],
) -> None:
    """Extract forms, their tags and romanizations from one form group.

    Bold nodes are the forms; the text accumulated before a bold node
    (excluding "mention-gloss-paren" spans) becomes its raw tags. A
    "gender" span immediately following a form adds gender tags, and a
    "tr" span supplies the romanization of the previous form. If the
    group contains no bold form at all, the collected text is attached
    to the word entry itself as tags.
    """
    tag_nodes = []
    has_forms = False
    striped_nodes = list(strip_nodes(form_nodes))
    for index, node in enumerate(striped_nodes):
        if (isinstance(node, HTMLNode) and node.tag == "b") or (
            isinstance(node, WikiNode) and node.kind == NodeKind.BOLD
        ):
            has_forms = True
            # (dead "ruby_data = []" assignment removed; extract_ruby
            # always returns a fresh list)
            ruby_data, node_without_ruby = extract_ruby(wxr, node)
            form = clean_node(wxr, None, node_without_ruby)
            raw_form_tags = extract_headword_tags(
                clean_node(wxr, None, tag_nodes).strip("() ")
            )
            form_tags = []
            # check if the next node carries gender data for this form
            if index < len(striped_nodes) - 1:
                next_node = striped_nodes[index + 1]
                if (
                    isinstance(next_node, WikiNode)
                    and next_node.kind == NodeKind.HTML
                    and next_node.tag == "span"
                    and "gender" in next_node.attrs.get("class", "")
                ):
                    gender = clean_node(wxr, None, next_node)
                    if gender in TEMPLATE_TAG_ARGS:
                        form_tags.append(TEMPLATE_TAG_ARGS[gender])
                    else:
                        raw_form_tags.append(gender)
            # one bold node may hold several forms separated by "/" or ","
            for f_str in filter(None, map(str.strip, re.split(r"/|,", form))):
                form_data = Form(
                    form=f_str,
                    raw_tags=raw_form_tags,
                    tags=form_tags,
                    ruby=ruby_data,
                )
                translate_raw_tags(form_data)
                word_entry.forms.append(form_data)
        elif (
            isinstance(node, HTMLNode)
            and node.tag == "span"
            and "tr" in node.attrs.get("class", "").split()
            and len(word_entry.forms) > 0
        ):
            # romanization of the previous form <b> tag
            word_entry.forms[-1].roman = clean_node(wxr, None, node)
        elif not (
            isinstance(node, HTMLNode)
            and node.tag == "span"
            and "mention-gloss-paren" in node.attrs.get("class", "").split()
        ):
            tag_nodes.append(node)

    if not has_forms:
        tags_list = extract_headword_tags(
            clean_node(wxr, word_entry, tag_nodes).strip("() ")
        )
        if len(tags_list) > 0:
            word_entry.raw_tags.extend(tags_list)
            translate_raw_tags(word_entry)
def extract_headword_tags(tags_str: str) -> list[str]:
    """Split a tag string on "&" and the Chinese conjunctions 或/和.

    Returns the stripped, non-empty pieces in their original order.
    """
    return [
        tag
        for tag in (piece.strip() for piece in re.split("&|或|和", tags_str))
        if tag
    ]
def extract_historical_kana(
    wxr: WiktextractContext, word_entry: WordEntry, sup_node: HTMLNode
) -> None:
    """Save a historical kana spelling from a <sup> node as an archaic form.

    Produced by the "hist" parameter of Japanese headword templates, e.g.
    https://zh.wiktionary.org/wiki/Template:ja-adj
    """
    kana = ""
    romanization = ""
    for bold in sup_node.find_html("strong"):
        kana = clean_node(wxr, None, bold)
    for tr_span in sup_node.find_html(
        "span", attr_name="class", attr_value="tr"
    ):
        romanization = clean_node(wxr, None, tr_span).strip("()")
    if len(kana) > 0:
        word_entry.forms.append(
            Form(form=kana, roman=romanization, tags=["archaic"])
        )
def extract_tlb_template(
    wxr: WiktextractContext, word_entry: WordEntry, t_node: TemplateNode
) -> None:
    """Collect term labels from an expanded {{tlb}}/{{term-label}} template.

    https://zh.wiktionary.org/wiki/Template:Tlb
    https://en.wiktionary.org/wiki/Template:term-label
    """
    expanded = wxr.wtp.parse(
        wxr.wtp.node_to_wikitext(t_node), expand_all=True
    )
    for label_span in expanded.find_html_recursively(
        "span", attr_name="class", attr_value="ib-content"
    ):
        label_text = clean_node(wxr, None, label_span)
        for label in label_text.split(","):
            label = label.strip()
            if len(label) > 0:
                word_entry.raw_tags.append(label)
    clean_node(wxr, word_entry, expanded)  # collect category links
    translate_raw_tags(word_entry)