Coverage for src/wiktextract/extractor/zh/headword_line.py: 90%
136 statements
« prev ^ index » next coverage.py v7.11.0, created at 2025-11-03 05:44 +0000
« prev ^ index » next coverage.py v7.11.0, created at 2025-11-03 05:44 +0000
1import re
3from wikitextprocessor import HTMLNode, NodeKind, TemplateNode, WikiNode
5from ...page import clean_node
6from ...wxr_context import WiktextractContext
7from ..ruby import extract_ruby
8from ..share import strip_nodes
9from .models import Form, WordEntry
10from .tags import TEMPLATE_TAG_ARGS, translate_raw_tags
def extract_pos_head_line_nodes(
    wxr: WiktextractContext, word_entry: WordEntry, nodes: list[WikiNode | str]
) -> None:
    """Dispatch every node of a part-of-speech headword line to its handler.

    Templates named ``tlb`` or ``term-label`` are passed to
    ``extract_tlb_template``; every other template is passed to
    ``extract_headword_line_template``. Of the non-template nodes, only the
    first bold wiki node is handled (by ``process_headword_bold_node``);
    later bold nodes and all other nodes, plain strings included, are
    skipped.
    """
    label_templates = ("tlb", "term-label")
    bold_seen = False
    for item in nodes:
        if isinstance(item, TemplateNode):
            if item.template_name in label_templates:
                extract_tlb_template(wxr, word_entry, item)
            else:
                extract_headword_line_template(wxr, word_entry, item)
            continue
        if bold_seen:
            # only the first bold node is treated as the headword
            continue
        if isinstance(item, WikiNode) and item.kind == NodeKind.BOLD:
            process_headword_bold_node(wxr, word_entry, item)
            bold_seen = True
def extract_headword_line_template(
    wxr: WiktextractContext, word_entry: WordEntry, t_node: TemplateNode
) -> None:
    """Extract forms and tags from an expanded headword-line template.

    Only the ``head`` template and templates whose name starts with
    ``"<lang_code>-"`` are processed; templates whose name ends with
    ``-see`` are ignored. The template is expanded and every ``span`` with
    class ``headword-line`` is scanned:

    * ``headword-tr`` spans are added as forms tagged ``romanization``;
    * ``gender`` spans add tags (or raw tags) to ``word_entry``;
    * ``ib-content`` spans add raw tags to ``word_entry``;
    * ``strong`` elements with a ``headword`` class are handed to
      ``process_headword_bold_node``.

    The children that follow the last such element are passed to
    ``extract_headword_forms``.
    """
    # handle the first template in header line
    template_name = t_node.template_name
    if (
        template_name != "head"
        and not template_name.startswith(f"{word_entry.lang_code}-")
    ) or template_name.endswith("-see"):
        return

    expanded_node = wxr.wtp.parse(
        wxr.wtp.node_to_wikitext(t_node), expand_all=True
    )
    # Return value is discarded; presumably called so that clean_node can
    # record data (e.g. categories) on word_entry - confirm in clean_node.
    clean_node(wxr, word_entry, expanded_node)
    for span_node in expanded_node.find_html(
        "span", attr_name="class", attr_value="headword-line"
    ):
        # Reset for every headword line: the index refers to the children
        # of *this* span, so a value left over from a previous span would
        # slice the wrong nodes below.
        forms_start_index = 0
        for index, span_child in span_node.find_child(NodeKind.HTML, True):
            if span_child.tag == "span":
                forms_start_index = index + 1
                class_names = span_child.attrs.get("class", "").split()
                if "headword-tr" in class_names:
                    form = clean_node(wxr, word_entry, span_child)
                    if form != "":
                        word_entry.forms.append(
                            Form(form=form, tags=["romanization"])
                        )
                elif "gender" in class_names:
                    for abbr_tag in span_child.find_html("abbr"):
                        gender = clean_node(wxr, None, abbr_tag)
                        if gender in TEMPLATE_TAG_ARGS:
                            word_entry.tags.append(TEMPLATE_TAG_ARGS[gender])
                        else:
                            word_entry.raw_tags.append(gender)
                            translate_raw_tags(word_entry)
                elif "ib-content" in class_names:
                    raw_tag = clean_node(wxr, None, span_child)
                    if raw_tag != "":
                        word_entry.raw_tags.append(raw_tag)
                        translate_raw_tags(word_entry)
                else:
                    # any other span may wrap the headword itself
                    for strong_node in span_child.find_html(
                        "strong", attr_name="class", attr_value="headword"
                    ):
                        process_headword_bold_node(wxr, word_entry, strong_node)
            elif (
                span_child.tag == "strong"
                and "headword" in span_child.attrs.get("class", "")
            ):
                forms_start_index = index + 1
                process_headword_bold_node(wxr, word_entry, span_child)
            elif span_child.tag == "b":
                # this is a form <b> tag, already inside form parentheses
                break

        extract_headword_forms(
            wxr, word_entry, span_node.children[forms_start_index:]
        )
def process_headword_bold_node(
    wxr: WiktextractContext, word_entry: WordEntry, strong_node: HTMLNode
) -> None:
    """Record the headword shown in a bold/strong node.

    Ruby annotations are split off with ``extract_ruby`` and the remaining
    node is cleaned to plain text. Nothing is recorded when that text is
    empty, or when it equals ``word_entry.word`` and there is no ruby data.

    For pages whose title starts with ``不支援的頁面名稱/`` the text replaces
    ``word_entry.word`` and the page title is saved in
    ``word_entry.original_title``. Otherwise the text is appended to
    ``word_entry.forms`` as a form tagged ``canonical``.
    """
    ruby_data, node_without_ruby = extract_ruby(wxr, strong_node)
    form = clean_node(wxr, word_entry, node_without_ruby)
    if len(form) == 0:
        return
    if len(ruby_data) == 0 and form == word_entry.word:
        # identical to the page word and nothing extra to record
        return

    if wxr.wtp.title.startswith("不支援的頁面名稱/"):
        # Unsupported titles:
        # https://zh.wiktionary.org/wiki/Appendix:不支援的頁面名稱
        # https://zh.wiktionary.org/wiki/Special:PrefixIndex/不支援的頁面名稱
        word_entry.word = form
        word_entry.original_title = wxr.wtp.title
    else:
        # Reuse the text cleaned above instead of cleaning the same node a
        # second time.
        word_entry.forms.append(
            Form(form=form, ruby=ruby_data, tags=["canonical"])
        )
def extract_headword_forms(
    wxr: WiktextractContext,
    word_entry: WordEntry,
    form_nodes: list[WikiNode | str],
) -> None:
    """Split headword-line nodes into comma-separated groups and process them.

    A string node that starts with a comma closes the current group: the
    nodes collected so far are passed to ``process_forms_text`` and a new
    group is started with the rest of that string (the comma removed).
    Any nodes left at the end are processed as the final group.
    """
    # Accept both the full-width comma (U+FF0C), the usual list separator
    # in Chinese text, and the ASCII comma. The escape keeps the full-width
    # character from being normalised into a second ASCII comma.
    separators = ("\uff0c", ",")
    current_nodes = []
    for node in form_nodes:
        if isinstance(node, str) and node.startswith(separators):
            process_forms_text(wxr, word_entry, current_nodes)
            # both separators are one character long
            current_nodes = [node[1:]]
        else:
            current_nodes.append(node)

    if current_nodes:
        process_forms_text(wxr, word_entry, current_nodes)
def process_forms_text(
    wxr: WiktextractContext,
    word_entry: WordEntry,
    form_nodes: list[WikiNode | str],
) -> None:
    """Extract the forms, or failing that the tags, of one headword group.

    The nodes are first passed through ``strip_nodes``. Each ``<b>`` element
    is a form: its text is split on ``/`` and every non-empty piece is
    appended to ``word_entry.forms``. The raw tags of a form come from the
    non-form nodes collected before it; a ``span`` with a ``gender`` class
    directly after the ``<b>`` element adds a tag (when listed in
    ``TEMPLATE_TAG_ARGS``) or a raw tag. A ``span`` whose class contains
    ``tr`` sets the ``roman`` field of the last form on the entry, and for
    Japanese (``ja``) a ``<sup>`` element is handed to
    ``extract_historical_kana``.

    If the group contains no ``<b>`` element, the collected text is treated
    as raw tags of ``word_entry`` itself.
    """
    stripped = list(strip_nodes(form_nodes))
    lang = word_entry.lang_code
    pending_tag_nodes = []
    found_form = False
    last_position = len(stripped) - 1

    for position, item in enumerate(stripped):
        if not (isinstance(item, WikiNode) and item.kind == NodeKind.HTML):
            pending_tag_nodes.append(item)
            continue

        if item.tag == "b":
            found_form = True
            rubies = []
            if lang == "ja":
                rubies, ruby_free_node = extract_ruby(wxr, item)
                form_text = clean_node(wxr, None, ruby_free_node)
            else:
                form_text = clean_node(wxr, None, item)
            tag_text = clean_node(wxr, None, pending_tag_nodes)
            raw_tags = extract_headword_tags(tag_text.strip("() "))
            known_tags = []
            # check if next tag has gender data
            if position < last_position:
                follower = stripped[position + 1]
                is_gender_span = (
                    isinstance(follower, WikiNode)
                    and follower.kind == NodeKind.HTML
                    and follower.tag == "span"
                    and "gender" in follower.attrs.get("class", "")
                )
                if is_gender_span:
                    gender_text = clean_node(wxr, None, follower)
                    if gender_text in TEMPLATE_TAG_ARGS:
                        known_tags.append(TEMPLATE_TAG_ARGS[gender_text])
                    else:
                        raw_tags.append(gender_text)

            for piece in form_text.split("/"):
                piece = piece.strip()
                if not piece:
                    continue
                new_form = Form(
                    form=piece,
                    raw_tags=raw_tags,
                    tags=known_tags,
                    ruby=rubies,
                )
                translate_raw_tags(new_form)
                word_entry.forms.append(new_form)
            continue

        if (
            item.tag == "span"
            and "tr" in item.attrs.get("class", "")
            and len(word_entry.forms) > 0
        ):
            # romanization of the previous form <b> tag
            word_entry.forms[-1].roman = clean_node(wxr, None, item)
            continue

        if item.tag == "sup" and lang == "ja":
            extract_historical_kana(wxr, word_entry, item)
            continue

        # any other HTML element is kept as tag text
        pending_tag_nodes.append(item)

    if found_form:
        return

    # no form in this group: the whole text describes the entry itself
    entry_tag_text = clean_node(wxr, word_entry, pending_tag_nodes)
    entry_tags = extract_headword_tags(entry_tag_text.strip("() "))
    if entry_tags:
        word_entry.raw_tags.extend(entry_tags)
        translate_raw_tags(word_entry)
def extract_headword_tags(tags_str: str) -> list[str]:
    """Split a headword tag string into individual tags.

    The string is split on ``&``, ``或`` and ``和``. Each piece is stripped
    of surrounding whitespace and empty pieces are dropped, so an empty or
    separator-only string yields an empty list.
    """
    return [
        tag
        for piece in re.split(r"&|或|和", tags_str)
        if (tag := piece.strip())
    ]
def extract_historical_kana(
    wxr: WiktextractContext, word_entry: WordEntry, sup_node: HTMLNode
) -> None:
    """Append the form found in a ``<sup>`` element to ``word_entry.forms``.

    The form text is taken from the last ``<strong>`` element found in
    ``sup_node``; the romanization from the last ``span`` selected with
    class ``tr``, with surrounding parentheses stripped. Nothing is added
    when no form text is found.
    """
    # https://zh.wiktionary.org/wiki/Template:ja-adj
    # "hist" parameter
    kana_texts = [
        clean_node(wxr, None, strong_tag)
        for strong_tag in sup_node.find_html("strong")
    ]
    roman_texts = [
        clean_node(wxr, None, tr_span).strip("()")
        for tr_span in sup_node.find_html(
            "span", attr_name="class", attr_value="tr"
        )
    ]
    # when several elements match, the last one wins
    kana = kana_texts[-1] if kana_texts else ""
    romanization = roman_texts[-1] if roman_texts else ""
    if not kana:
        return
    word_entry.forms.append(Form(form=kana, roman=romanization))
def extract_tlb_template(
    wxr: WiktextractContext, word_entry: WordEntry, t_node: TemplateNode
) -> None:
    """Collect raw tags from an expanded ``tlb`` / ``term-label`` template.

    The template is expanded and the text of every ``span`` with class
    ``ib-content`` is split on ``,``; each non-empty, stripped piece is
    appended to ``word_entry.raw_tags``. The raw tags are then passed
    through ``translate_raw_tags``.
    """
    # https://zh.wiktionary.org/wiki/Template:Tlb
    # https://en.wiktionary.org/wiki/Template:term-label
    wikitext = wxr.wtp.node_to_wikitext(t_node)
    expanded = wxr.wtp.parse(wikitext, expand_all=True)
    for label_span in expanded.find_html_recursively(
        "span", attr_name="class", attr_value="ib-content"
    ):
        label_text = clean_node(wxr, None, label_span)
        for piece in label_text.split(","):
            label = piece.strip()
            if not label:
                continue
            word_entry.raw_tags.append(label)
    # Return value is discarded; presumably called so that clean_node can
    # record data (e.g. categories) on word_entry - confirm in clean_node.
    clean_node(wxr, word_entry, expanded)
    translate_raw_tags(word_entry)