Coverage for src/wiktextract/extractor/ja/example.py: 74%
98 statements
« prev ^ index » next coverage.py v7.9.0, created at 2025-06-13 07:43 +0000
« prev ^ index » next coverage.py v7.9.0, created at 2025-06-13 07:43 +0000
1from wikitextprocessor.parser import NodeKind, TemplateNode, WikiNode
3from ...page import clean_node
4from ...wxr_context import WiktextractContext
5from ..ruby import extract_ruby
6from ..share import calculate_bold_offsets
7from .linkage import (
8 LINKAGE_TEMPLATES,
9 extract_gloss_list_linkage_template,
10 process_linkage_list_item,
11)
12from .models import Example, Sense, WordEntry
13from .section_titles import LINKAGES
def extract_example_list_item(
    wxr: WiktextractContext,
    word_entry: WordEntry,
    sense: Sense,
    list_item: WikiNode,
    parent_list_text: str = "",
) -> None:
    """Extract data from one list item found under a gloss.

    Depending on the content of ``list_item`` this either hands the item to
    the linkage extractors, processes example/quote templates, appends a
    note to ``sense.notes``, appends an ``Example`` to ``sense.examples``,
    or recurses into the nested list items.

    ``parent_list_text`` is the cleaned text of the enclosing list item
    (passed by the recursive call); when non-empty it becomes the
    example's ``ref``.
    """
    # https://ja.wiktionary.org/wiki/Wiktionary:用例#用例を示す形式

    # Linkage data can appear in the same list; detect it first.
    for position, child in enumerate(list_item.children):
        if isinstance(child, str):
            if ":" not in child:
                continue
            # Text in front of the colon is compared with known linkage names.
            label = clean_node(wxr, None, list_item.children[:position])
            if label in LINKAGES:
                first_gloss = sense.glosses[0] if sense.glosses else ""
                process_linkage_list_item(
                    wxr, word_entry, list_item, "", first_gloss
                )
                return
        elif (
            isinstance(child, TemplateNode)
            and child.template_name in LINKAGE_TEMPLATES
        ):
            extract_gloss_list_linkage_template(wxr, word_entry, child)
            return

    has_bold = any(
        child.kind == NodeKind.BOLD or child.contain_node(NodeKind.BOLD)
        for child in list_item.children
        if isinstance(child, WikiNode) and child.kind != NodeKind.LIST
    )

    if not has_bold and list_item.contain_node(NodeKind.LIST):
        # No bold text but a nested list: the text of this item (plus any
        # <ref> tags) is passed down to every nested list item.
        own_nodes = list(list_item.invert_find_child(NodeKind.LIST))
        item_text = clean_node(wxr, None, own_nodes)
        for ref_tag in list_item.find_html("ref"):
            item_text += " " + clean_node(wxr, None, ref_tag.children)
        for nested_item in list_item.find_child_recursively(
            NodeKind.LIST_ITEM
        ):
            extract_example_list_item(
                wxr, word_entry, sense, nested_item, item_text
            )
        return

    # From here on: the item has a bold node or has no nested list.
    found_template = False
    for template in list_item.find_child(NodeKind.TEMPLATE):
        template_name = template.template_name
        if template_name in ["ux", "uxi"]:
            process_ux_template(wxr, template, sense)
            found_template = True
        elif template_name in ["quote", "quote-book"]:
            extract_quote_template(wxr, template, sense)
            found_template = True
    if found_template:
        return

    for bold_index, bold_node in list_item.find_child(NodeKind.BOLD, True):
        if clean_node(wxr, None, bold_node) != "注.":
            continue
        # Everything after the bold "注." marker is the note text.
        note = clean_node(
            wxr, None, list_item.children[bold_index + 1 :]
        ).lstrip(": ")
        if note != "":
            sense.notes.append(note)
        return

    own_wikitext = wxr.wtp.node_to_wikitext(
        list(list_item.invert_find_child(NodeKind.LIST))
    )
    expanded_nodes = wxr.wtp.parse(own_wikitext, expand_all=True)
    ruby, no_ruby = extract_ruby(wxr, expanded_nodes.children)
    example = Example(text=clean_node(wxr, None, no_ruby), ruby=ruby)
    calculate_bold_offsets(
        wxr,
        wxr.wtp.parse(wxr.wtp.node_to_wikitext(no_ruby)),
        example.text,
        example,
        "bold_text_offsets",
    )

    # When several nested items exist, the last one wins.
    for nested_item in list_item.find_child_recursively(NodeKind.LIST_ITEM):
        example.translation = clean_node(wxr, None, nested_item.children)

    if len(parent_list_text) > 0:
        example.ref = parent_list_text
    else:
        # Split the text at the last occurrence of the first marker found.
        marker = next(
            (m for m in ["(", "――"] if m in example.text), None
        )
        if marker is not None:
            cut = example.text.rindex(marker)
            example.ref = example.text[cut:]
            example.text = example.text[:cut].strip()
            for ref_tag in expanded_nodes.find_html_recursively("ref"):
                example.ref += " " + clean_node(
                    wxr, None, ref_tag.children
                )
    sense.examples.append(example)
def process_ux_template(
    wxr: WiktextractContext, t_node: TemplateNode, sense: Sense
) -> None:
    """Build an ``Example`` from an expanded ``ux``/``uxi`` template.

    The template is expanded and its ``<i>`` and ``<span>`` tags are
    inspected by CSS class to fill the example text, romanization and
    translation together with their bold offsets.  The example is appended
    to ``sense.examples`` only when its text is not empty.
    """
    # https://ja.wiktionary.org/wiki/テンプレート:ux
    # https://ja.wiktionary.org/wiki/テンプレート:uxi
    i_tag_fields = (
        ("e-example", "text", "bold_text_offsets"),
        ("e-transliteration", "roman", "bold_roman_offsets"),
    )

    example = Example()
    template_wikitext = wxr.wtp.node_to_wikitext(t_node)
    expanded_node = wxr.wtp.parse(template_wikitext, expand_all=True)

    for i_tag in expanded_node.find_html_recursively("i"):
        css_class = i_tag.attrs.get("class", "")
        # First matching class marker decides the target field.
        for marker, field_name, offsets_field in i_tag_fields:
            if marker not in css_class:
                continue
            setattr(example, field_name, clean_node(wxr, None, i_tag))
            calculate_bold_offsets(
                wxr,
                i_tag,
                getattr(example, field_name),
                example,
                offsets_field,
            )
            break

    for span_tag in expanded_node.find_html_recursively("span"):
        if "e-translation" not in span_tag.attrs.get("class", ""):
            continue
        example.translation = clean_node(wxr, None, span_tag)
        calculate_bold_offsets(
            wxr,
            span_tag,
            example.translation,
            example,
            "bold_translation_offsets",
        )

    if example.text != "":
        sense.examples.append(example)
    # NOTE(review): presumably run for its side effects on ``sense``
    # (e.g. categories) - confirm against clean_node.
    clean_node(wxr, sense, t_node)
def extract_quote_template(
    wxr: WiktextractContext, t_node: TemplateNode, sense: Sense
) -> None:
    """Build an ``Example`` from an expanded ``quote`` template.

    ``<span>`` tags of the expanded template are matched by CSS class to
    fill the quotation text, romanization, translation and cited source.
    Any ``<ref>`` tag found afterwards replaces the ``ref`` value.  The
    example is appended to ``sense.examples`` only when its text is not
    empty.
    """
    # https://ja.wiktionary.org/wiki/テンプレート:quote
    # (class marker, Example field, bold offsets field or None); the order
    # matters because the first matching marker wins.
    span_fields = (
        (" e-quotation", "text", "bold_text_offsets"),
        ("e-transliteration", "roman", "bold_roman_offsets"),
        ("e-translation", "translation", "bold_translation_offsets"),
        ("cited-source", "ref", None),
    )

    example = Example()
    template_wikitext = wxr.wtp.node_to_wikitext(t_node)
    expanded_node = wxr.wtp.parse(template_wikitext, expand_all=True)

    for span_tag in expanded_node.find_html_recursively("span"):
        css_class = span_tag.attrs.get("class", "")
        for marker, field_name, offsets_field in span_fields:
            if marker not in css_class:
                continue
            setattr(example, field_name, clean_node(wxr, None, span_tag))
            if offsets_field is not None:
                calculate_bold_offsets(
                    wxr,
                    span_tag,
                    getattr(example, field_name),
                    example,
                    offsets_field,
                )
            break

    # A <ref> tag overrides whatever the "cited-source" span provided.
    for ref_tag in expanded_node.find_html_recursively("ref"):
        example.ref = clean_node(wxr, None, ref_tag.children)

    if example.text != "":
        sense.examples.append(example)
    # NOTE(review): presumably run for its side effects on ``sense``
    # (e.g. categories) - confirm against clean_node.
    clean_node(wxr, sense, t_node)