Coverage for src/wiktextract/extractor/zh/headword_line.py: 91%
113 statements
« prev ^ index » next coverage.py v7.6.10, created at 2024-12-27 08:07 +0000
« prev ^ index » next coverage.py v7.6.10, created at 2024-12-27 08:07 +0000
1import re
2from typing import Union
4from wikitextprocessor import HTMLNode, NodeKind, TemplateNode, WikiNode
6from ...page import clean_node
7from ...wxr_context import WiktextractContext
8from ..ruby import extract_ruby
9from ..share import strip_nodes
10from .models import Form, WordEntry
11from .tags import TEMPLATE_TAG_ARGS, translate_raw_tags
def extract_headword_line_template(
    wxr: WiktextractContext,
    page_data: list[WordEntry],
    node: TemplateNode,
    lang_code: str,
) -> None:
    """Extract forms and tags from the first template of a headword line.

    Only the ``head`` template and templates named ``{lang_code}-...`` are
    processed; templates whose name ends with ``-see`` are ignored.  The
    template is expanded and every ``<span class="headword-line">`` in the
    expansion is scanned:

    * ``<span>`` children with class ``headword-tr`` become a form tagged
      ``romanization``;
    * ``<span>`` children with class ``gender`` contribute one tag per
      ``<abbr>`` element (mapped through ``TEMPLATE_TAG_ARGS`` when known,
      otherwise stored as a raw tag);
    * other ``<span>`` children are searched for ``<strong class="headword">``
      elements, which are handed to ``process_ja_headword``;
    * a ``<strong>`` child with class ``headword`` is handed to
      ``process_ja_headword`` only when ``lang_code`` is ``"ja"``.

    The nodes that follow the last such child are passed on to
    ``extract_headword_forms``.  All results are written to
    ``page_data[-1]``; nothing is returned.
    """
    # handle the first template in header line
    template_name = node.template_name
    if (
        template_name != "head"
        and not template_name.startswith(f"{lang_code}-")
    ) or template_name.endswith("-see"):
        return

    expanded_node = wxr.wtp.parse(
        wxr.wtp.node_to_wikitext(node), expand_all=True
    )
    clean_node(wxr, page_data[-1], expanded_node)
    for span_node in expanded_node.find_html(
        "span", attr_name="class", attr_value="headword-line"
    ):
        # Reset for every headword-line span so that an offset found in a
        # previous span is never used to slice this span's children.
        forms_start_index = 0
        for index, span_child in span_node.find_child(NodeKind.HTML, True):
            if span_child.tag == "span":
                forms_start_index = index + 1
                class_names = span_child.attrs.get("class", "")
                if "headword-tr" in class_names:
                    page_data[-1].forms.append(
                        Form(
                            form=clean_node(wxr, page_data[-1], span_child),
                            tags=["romanization"],
                        )
                    )
                elif "gender" in class_names:
                    for abbr_tag in span_child.find_html("abbr"):
                        # Use the cleaned text of the element (as
                        # process_forms_text does for gender spans) so an
                        # empty or nested <abbr> cannot raise or store a
                        # non-string tag.
                        gender = clean_node(wxr, None, abbr_tag)
                        if not gender:
                            continue
                        if gender in TEMPLATE_TAG_ARGS:
                            page_data[-1].tags.append(
                                TEMPLATE_TAG_ARGS[gender]
                            )
                        else:
                            page_data[-1].raw_tags.append(gender)
                            translate_raw_tags(page_data[-1])
                else:
                    for strong_node in span_child.find_html(
                        "strong", attr_name="class", attr_value="headword"
                    ):
                        process_ja_headword(wxr, page_data, strong_node)
            elif (
                span_child.tag == "strong"
                and "headword" in span_child.attrs.get("class", "")
            ):
                forms_start_index = index + 1
                if lang_code == "ja":
                    process_ja_headword(wxr, page_data, span_child)
            elif span_child.tag == "b":
                # this is a form <b> tag, already inside form parentheses
                break

        extract_headword_forms(
            wxr, page_data, span_node.children[forms_start_index:]
        )
def process_ja_headword(
    wxr: WiktextractContext,
    page_data: list[WordEntry],
    strong_node: HTMLNode,
) -> None:
    """Add the headword in ``strong_node`` as a ``canonical`` form.

    Ruby annotations are separated from the node with ``extract_ruby``; the
    remaining text is the form.  The form is appended to
    ``page_data[-1].forms`` when it is non-empty and either carries ruby
    data or differs from ``page_data[-1].word``.
    """
    ruby_data, node_without_ruby = extract_ruby(wxr, strong_node)
    form = clean_node(wxr, page_data[-1], node_without_ruby)
    if not form:
        return
    if ruby_data or form != page_data[-1].word:
        # Reuse the text computed above instead of cleaning the same node a
        # second time.
        page_data[-1].forms.append(
            Form(form=form, ruby=ruby_data, tags=["canonical"])
        )
def extract_headword_forms(
    wxr: WiktextractContext,
    page_data: list[WordEntry],
    form_nodes: list[Union[WikiNode, str]],
) -> None:
    """Split ``form_nodes`` into comma-separated groups and process each.

    A new group starts at every string node that begins with a comma; the
    comma itself is dropped and the rest of that string opens the next
    group.  Each group is handed to ``process_forms_text``, which writes its
    results to ``page_data[-1]``.
    """
    # Both the full-width comma (U+FF0C) and the ASCII comma separate forms.
    separators = ("\uff0c", ",")
    current_nodes = []
    for node in form_nodes:
        if isinstance(node, str) and node.startswith(separators):
            process_forms_text(wxr, page_data, current_nodes)
            # every separator is a single character, so drop exactly one
            current_nodes = [node[1:]]
        else:
            current_nodes.append(node)

    if current_nodes:
        process_forms_text(wxr, page_data, current_nodes)
def process_forms_text(
    wxr: WiktextractContext,
    page_data: list[WordEntry],
    form_nodes: list[Union[WikiNode, str]],
) -> None:
    """Turn one group of headword-line nodes into forms or entry tags.

    Every ``<b>`` element becomes a ``Form`` on ``page_data[-1]``; the nodes
    collected before it supply its raw tags, and a directly following
    ``<span>`` whose class contains ``gender`` supplies a gender tag.  A
    ``<span>`` whose class contains ``tr`` sets ``roman`` on the most recent
    form, and for Japanese entries a ``<sup>`` element is passed to
    ``extract_historical_kana``.  When the group holds no ``<b>`` element at
    all, the collected text is treated as tags of the entry itself.
    """
    nodes = list(strip_nodes(form_nodes))
    entry_lang = page_data[-1].lang_code
    label_nodes = []
    found_form = False
    last_position = len(nodes) - 1

    for position, current in enumerate(nodes):
        is_html = (
            isinstance(current, WikiNode) and current.kind == NodeKind.HTML
        )
        if not is_html:
            label_nodes.append(current)
            continue

        if current.tag == "b":
            found_form = True
            if entry_lang == "ja":
                ruby, plain_node = extract_ruby(wxr, current)
                form_text = clean_node(wxr, None, plain_node)
            else:
                ruby = []
                form_text = clean_node(wxr, None, current)

            label_text = clean_node(wxr, None, label_nodes)
            raw_labels = extract_headword_tags(label_text.strip("() "))
            known_tags = []
            # the node right after the form may carry its gender
            if position < last_position:
                following = nodes[position + 1]
                if (
                    isinstance(following, WikiNode)
                    and following.kind == NodeKind.HTML
                    and following.tag == "span"
                    and "gender" in following.attrs.get("class", "")
                ):
                    gender_text = clean_node(wxr, None, following)
                    if gender_text in TEMPLATE_TAG_ARGS:
                        known_tags.append(TEMPLATE_TAG_ARGS[gender_text])
                    else:
                        raw_labels.append(gender_text)

            new_form = Form(
                form=form_text,
                raw_tags=raw_labels,
                tags=known_tags,
                ruby=ruby,
            )
            translate_raw_tags(new_form)
            page_data[-1].forms.append(new_form)
        elif (
            current.tag == "span"
            and "tr" in current.attrs.get("class", "")
            and page_data[-1].forms
        ):
            # romanization belongs to the form added just before it
            page_data[-1].forms[-1].roman = clean_node(wxr, None, current)
        elif current.tag == "sup" and entry_lang == "ja":
            extract_historical_kana(wxr, page_data, current)
        else:
            label_nodes.append(current)

    if found_form:
        return

    # no form in this group: the text describes the entry itself
    entry_label_text = clean_node(wxr, page_data[-1], label_nodes)
    entry_labels = extract_headword_tags(entry_label_text.strip("() "))
    if entry_labels:
        page_data[-1].raw_tags.extend(entry_labels)
        translate_raw_tags(page_data[-1])
def extract_headword_tags(tags_str: str) -> list[str]:
    """Split a headword tag string into individual tag names.

    The string is split on "&", "或" and "和"; each piece is stripped of
    surrounding whitespace and empty pieces are dropped.
    """
    pieces = re.split("&|或|和", tags_str)
    return [tag for tag in map(str.strip, pieces) if tag]
def extract_historical_kana(
    wxr: WiktextractContext,
    page_data: list[WordEntry],
    sup_node: HTMLNode,
) -> None:
    """Add the form found in a ``<sup>`` element to ``page_data[-1].forms``.

    The form text comes from the last ``<strong>`` element found in
    ``sup_node`` and the romanization from the last ``<span class="tr">``
    (with surrounding parentheses removed).  Nothing is added when no form
    text is found.
    """
    # https://zh.wiktionary.org/wiki/Template:ja-adj
    # "hist" parameter
    kana = ""
    for strong_tag in sup_node.find_html("strong"):
        kana = clean_node(wxr, None, strong_tag)

    romanization = ""
    for tr_span in sup_node.find_html(
        "span", attr_name="class", attr_value="tr"
    ):
        romanization = clean_node(wxr, None, tr_span).strip("()")

    if not kana:
        return
    page_data[-1].forms.append(Form(form=kana, roman=romanization))
def extract_tlb_template(
    wxr: WiktextractContext,
    template_node: TemplateNode,
    page_data: list[WordEntry],
) -> None:
    """Collect raw tags from an expanded term-label template.

    The template is expanded and the non-empty text of every
    ``<span class="ib-content">`` found in it is appended to
    ``page_data[-1].raw_tags``.  The whole expansion is then cleaned against
    ``page_data[-1]`` and the raw tags are translated.
    """
    # https://zh.wiktionary.org/wiki/Template:Tlb
    # https://en.wiktionary.org/wiki/Template:term-label
    wikitext = wxr.wtp.node_to_wikitext(template_node)
    expanded = wxr.wtp.parse(wikitext, expand_all=True)
    entry = page_data[-1]
    for label_span in expanded.find_html_recursively(
        "span", attr_name="class", attr_value="ib-content"
    ):
        label = clean_node(wxr, None, label_span)
        if not label:
            continue
        entry.raw_tags.append(label)
    clean_node(wxr, entry, expanded)
    translate_raw_tags(entry)