Coverage for src/wiktextract/extractor/ja/example.py: 74%
98 statements
« prev ^ index » next coverage.py v7.10.7, created at 2025-10-13 10:14 +0000
« prev ^ index » next coverage.py v7.10.7, created at 2025-10-13 10:14 +0000
1from wikitextprocessor.parser import NodeKind, TemplateNode, WikiNode
3from ...page import clean_node
4from ...wxr_context import WiktextractContext
5from ..ruby import extract_ruby
6from ..share import calculate_bold_offsets
7from .linkage import (
8 LINKAGE_TEMPLATES,
9 extract_gloss_list_linkage_template,
10 process_linkage_list_item,
11)
12from .models import Example, Sense, WordEntry
13from .section_titles import LINKAGES
def extract_example_list_item(
    wxr: WiktextractContext,
    word_entry: WordEntry,
    sense: Sense,
    list_item: WikiNode,
    parent_list_text: str = "",
) -> None:
    """Extract the data of one example list item into `sense`.

    The list item is handled as the first matching case:

    1. Linkage data: the text before a string child containing ":" is a
       key of `LINKAGES`, or a child template is in `LINKAGE_TEMPLATES`.
       The item is handed to the linkage extractors.
    2. Grouping item: no bold node outside nested lists, but a nested list
       exists. The item's own text (plus any direct `<ref>` text) is passed
       as `parent_list_text` to every nested list item, recursively.
    3. Example item: "ux"/"uxi"/"quote"/"quote-book" templates, a "注."
       note, or plain example text with optional ruby, translation and
       reference.

    `parent_list_text`, when not empty, is used as the example reference.
    """
    # https://ja.wiktionary.org/wiki/Wiktionary:用例#用例を示す形式

    # check if it's linkage data
    for position, child in enumerate(list_item.children):
        if isinstance(child, str):
            if ":" not in child:
                continue
            heading = clean_node(wxr, None, list_item.children[:position])
            if heading in LINKAGES:
                first_gloss = sense.glosses[0] if sense.glosses else ""
                process_linkage_list_item(
                    wxr, word_entry, list_item, "", first_gloss
                )
                return
        elif (
            isinstance(child, TemplateNode)
            and child.template_name in LINKAGE_TEMPLATES
        ):
            extract_gloss_list_linkage_template(wxr, word_entry, child)
            return

    bold_outside_lists = any(
        child.contain_node(NodeKind.BOLD) or child.kind == NodeKind.BOLD
        for child in list_item.children
        if isinstance(child, WikiNode) and child.kind != NodeKind.LIST
    )

    if not bold_outside_lists and list_item.contain_node(NodeKind.LIST):
        # no bold node but has a nested list: this item only supplies the
        # reference text for the examples in the nested list items
        own_nodes = list(
            list_item.invert_find_child(NodeKind.LIST, include_empty_str=True)
        )
        text_parts = [clean_node(wxr, None, own_nodes)]
        text_parts.extend(
            clean_node(wxr, None, ref_tag.children)
            for ref_tag in list_item.find_html("ref")
        )
        group_text = " ".join(text_parts)
        for nested_item in list_item.find_child_recursively(
            NodeKind.LIST_ITEM
        ):
            extract_example_list_item(
                wxr, word_entry, sense, nested_item, group_text
            )
        return

    # has bold node or doesn't have list child node
    template_handlers = {
        "ux": process_ux_template,
        "uxi": process_ux_template,
        "quote": extract_quote_template,
        "quote-book": extract_quote_template,
    }
    handled_by_template = False
    for template in list_item.find_child(NodeKind.TEMPLATE):
        handler = template_handlers.get(template.template_name)
        if handler is not None:
            handler(wxr, template, sense)
            handled_by_template = True
    if handled_by_template:
        return

    for bold_index, bold_node in list_item.find_child(NodeKind.BOLD, True):
        if clean_node(wxr, None, bold_node) != "注.":
            continue
        # everything after the bold "注." marker is the note text
        note = clean_node(
            wxr, None, list_item.children[bold_index + 1 :]
        ).lstrip(": ")
        if note:
            sense.notes.append(note)
        return

    own_nodes = list(
        list_item.invert_find_child(NodeKind.LIST, include_empty_str=True)
    )
    own_wikitext = wxr.wtp.node_to_wikitext(own_nodes)
    expanded_nodes = wxr.wtp.parse(own_wikitext, expand_all=True)
    ruby, no_ruby = extract_ruby(wxr, expanded_nodes.children)
    example = Example(text=clean_node(wxr, None, no_ruby), ruby=ruby)
    no_ruby_root = wxr.wtp.parse(wxr.wtp.node_to_wikitext(no_ruby))
    calculate_bold_offsets(
        wxr, no_ruby_root, example.text, example, "bold_text_offsets"
    )

    # when several nested list items exist, the last one wins
    for tr_list_item in list_item.find_child_recursively(NodeKind.LIST_ITEM):
        example.translation = clean_node(wxr, None, tr_list_item.children)

    if parent_list_text:
        example.ref = parent_list_text
    else:
        # the reference starts at the last occurrence of the first marker
        # that is present in the example text
        marker = next(
            (m for m in ("(", "――") if m in example.text), None
        )
        if marker is not None:
            ref_start = example.text.rindex(marker)
            ref_parts = [example.text[ref_start:]]
            example.text = example.text[:ref_start].strip()
            ref_parts.extend(
                clean_node(wxr, None, ref_tag.children)
                for ref_tag in expanded_nodes.find_html_recursively("ref")
            )
            example.ref = " ".join(ref_parts)
    sense.examples.append(example)
def process_ux_template(
    wxr: WiktextractContext, t_node: TemplateNode, sense: Sense
) -> None:
    """Build an `Example` from an expanded "ux"/"uxi" template.

    The template is expanded and the example text, transliteration and
    translation are read from the HTML tags carrying the matching CSS
    class. The example is appended to `sense.examples` only when its text
    is not empty. `clean_node` is finally called on the template node with
    `sense` as its second argument.
    """
    # https://ja.wiktionary.org/wiki/テンプレート:ux
    # https://ja.wiktionary.org/wiki/テンプレート:uxi

    # (class substring, Example field, bold offsets field); first match wins
    i_tag_fields = (
        ("e-example", "text", "bold_text_offsets"),
        ("e-transliteration", "roman", "bold_roman_offsets"),
    )

    example = Example()
    template_wikitext = wxr.wtp.node_to_wikitext(t_node)
    expanded_node = wxr.wtp.parse(template_wikitext, expand_all=True)

    for i_tag in expanded_node.find_html_recursively("i"):
        css_classes = i_tag.attrs.get("class", "")
        for css_marker, field, offsets_field in i_tag_fields:
            if css_marker not in css_classes:
                continue
            setattr(example, field, clean_node(wxr, None, i_tag))
            calculate_bold_offsets(
                wxr, i_tag, getattr(example, field), example, offsets_field
            )
            break

    for span_tag in expanded_node.find_html_recursively("span"):
        if "e-translation" not in span_tag.attrs.get("class", ""):
            continue
        example.translation = clean_node(wxr, None, span_tag)
        calculate_bold_offsets(
            wxr,
            span_tag,
            example.translation,
            example,
            "bold_translation_offsets",
        )

    if example.text:
        sense.examples.append(example)
    clean_node(wxr, sense, t_node)
def extract_quote_template(
    wxr: WiktextractContext, t_node: TemplateNode, sense: Sense
) -> None:
    """Build an `Example` from an expanded "quote"/"quote-book" template.

    Each `<span>` of the expanded template is matched against a list of
    CSS class substrings to fill the example text, transliteration,
    translation or reference. Text of any `<ref>` tag then replaces the
    reference. The example is appended to `sense.examples` only when its
    text is not empty. `clean_node` is finally called on the template node
    with `sense` as its second argument.
    """
    # https://ja.wiktionary.org/wiki/テンプレート:quote

    # (class substring, Example field, bold offsets field or None);
    # checked in this order, first match wins.
    # NOTE(review): " e-quotation" starts with a space, so a class attribute
    # that begins with "e-quotation" does not match - confirm it's intended.
    span_fields = (
        (" e-quotation", "text", "bold_text_offsets"),
        ("e-transliteration", "roman", "bold_roman_offsets"),
        ("e-translation", "translation", "bold_translation_offsets"),
        ("cited-source", "ref", None),
    )

    example = Example()
    template_wikitext = wxr.wtp.node_to_wikitext(t_node)
    expanded_node = wxr.wtp.parse(template_wikitext, expand_all=True)

    for span_tag in expanded_node.find_html_recursively("span"):
        css_classes = span_tag.attrs.get("class", "")
        matched = next(
            (entry for entry in span_fields if entry[0] in css_classes),
            None,
        )
        if matched is None:
            continue
        _, field, offsets_field = matched
        setattr(example, field, clean_node(wxr, None, span_tag))
        if offsets_field is not None:
            calculate_bold_offsets(
                wxr,
                span_tag,
                getattr(example, field),
                example,
                offsets_field,
            )

    # a <ref> tag overrides the "cited-source" reference; the last one wins
    for ref_tag in expanded_node.find_html_recursively("ref"):
        example.ref = clean_node(wxr, None, ref_tag.children)

    if example.text:
        sense.examples.append(example)
    clean_node(wxr, sense, t_node)