Coverage for src/wiktextract/extractor/ja/sound.py: 90%
122 statements
« prev ^ index » next coverage.py v7.6.10, created at 2024-12-27 08:07 +0000
« prev ^ index » next coverage.py v7.6.10, created at 2024-12-27 08:07 +0000
1import itertools
3from wikitextprocessor.parser import LevelNode, NodeKind, TemplateNode
5from ...page import clean_node
6from ...wxr_context import WiktextractContext
7from ..share import set_sound_file_url_fields
8from .models import Sound, WordEntry
9from .tags import translate_raw_tags
def extract_sound_section(
    wxr: WiktextractContext,
    page_data: list[WordEntry],
    base_data: WordEntry,
    level_node: LevelNode,
) -> None:
    """Gather the sounds of a pronunciation section and attach them.

    When ``base_data.lang_code`` is ``"zh"`` the section is handed to
    ``extract_zh_sounds``; otherwise every template found recursively under
    ``level_node`` is passed to ``process_sound_template``.

    The collected sounds, plus whatever ended up under the ``"categories"``
    key of the scratch dict filled by the template handlers, are appended to:

    * ``base_data`` - only when ``level_node`` is a level-3 heading;
    * every entry of ``page_data`` whose ``lang_code`` equals that of
      ``base_data``.

    Nothing is returned; ``base_data`` and ``page_data`` entries are mutated.
    """
    sounds = []
    cats = {}
    if base_data.lang_code != "zh":
        for template_node in level_node.find_child_recursively(
            NodeKind.TEMPLATE
        ):
            process_sound_template(wxr, template_node, sounds, cats)
    else:
        extract_zh_sounds(wxr, level_node, sounds)

    categories = cats.get("categories", [])
    # base_data goes first so the update order matches: base entry, then the
    # already extracted entries of the same language.
    targets = [base_data] if level_node.kind == NodeKind.LEVEL3 else []
    targets.extend(
        entry for entry in page_data if entry.lang_code == base_data.lang_code
    )
    for entry in targets:
        entry.sounds.extend(sounds)
        entry.categories.extend(categories)
def process_sound_template(
    wxr: WiktextractContext,
    template_node: TemplateNode,
    sounds: list[Sound],
    cats: dict[str, list[str]],
) -> None:
    """Turn one pronunciation template into ``Sound`` items.

    Handled template names:

    * ``音声`` - positional argument 2 is the audio file (``""`` and ``"-"``
      are ignored), positional argument 3 an optional raw tag.
    * ``IPA`` / ``X-SAMPA`` - one ``Sound`` per non-empty positional
      argument; ``X-SAMPA`` sounds also get the ``"X-SAMPA"`` tag.
    * ``homophones`` - all non-empty positional arguments become the
      ``homophones`` list of a single ``Sound``.
    * ``ja-pron`` and ``ja-accent-common`` - delegated to their own handlers.

    New sounds are appended to ``sounds``. Regardless of the template name,
    the node is finally passed to ``clean_node`` together with ``cats``.
    """
    name = template_node.template_name
    params = template_node.template_parameters

    def positional_args():
        # Lazily yield arguments 1, 2, 3, ... and stop at the first index
        # that is not present in the template.
        present = itertools.takewhile(
            lambda index: index in params, itertools.count(1)
        )
        return (params[index] for index in present)

    if name == "音声":
        audio_file = clean_node(wxr, None, params.get(2, ""))
        if audio_file not in ("", "-"):
            sound = Sound()
            raw_tag = clean_node(wxr, None, params.get(3, ""))
            if raw_tag:
                sound.raw_tags.append(raw_tag)
            set_sound_file_url_fields(wxr, audio_file, sound)
            sounds.append(sound)
    elif name in ("IPA", "X-SAMPA"):
        for arg in positional_args():
            ipa = clean_node(wxr, None, arg)
            if not ipa:
                continue
            sound = Sound(ipa=ipa)
            if name == "X-SAMPA":
                sound.tags.append("X-SAMPA")
            sounds.append(sound)
    elif name == "homophones":
        cleaned = (clean_node(wxr, None, arg) for arg in positional_args())
        homophones = [text for text in cleaned if text]
        if homophones:
            sounds.append(Sound(homophones=homophones))
    elif name == "ja-pron":
        process_ja_pron_template(wxr, template_node, sounds)
    elif name == "ja-accent-common":
        process_ja_accent_common_template(wxr, template_node, sounds)

    # Runs for every template, including names not handled above; the caller
    # later reads the "categories" key of ``cats``.
    clean_node(wxr, cats, template_node)
# Link texts looked up by process_ja_pron_template, mapped to the tag that is
# appended to the sound when a link with that text is found.
JA_PRON_ACCENTS: dict[str, str] = {
    "中高型": "Nakadaka",
    "平板型": "Heiban",
    "頭高型": "Atamadaka",
    "尾高型": "Odaka",
}
def process_ja_pron_template(
    wxr: WiktextractContext,
    template_node: TemplateNode,
    sounds: list[Sound],
) -> None:
    """Extract sounds from an expanded ``ja-pron`` template.

    The template is converted back to wikitext, re-parsed with full
    expansion, and each list item of the result that does not contain a
    table is read into one ``Sound``:

    * ``<span>`` whose class contains ``qualifier-content`` -> raw tag
    * ``<span>`` whose class contains ``IPA`` -> ``ipa``
    * ``<span>`` whose class contains ``Latn`` -> ``roman``
    * ``<span>`` whose class contains ``Jpan`` -> ``form``
    * direct link children whose text is a key of ``JA_PRON_ACCENTS`` ->
      the mapped tag

    A sound is kept only if at least one field differs from its default.
    Afterwards the ``a`` and ``audio`` template arguments each add an audio
    file sound when non-empty.
    """
    # https://ja.wiktionary.org/wiki/テンプレート:ja-pron
    wikitext = wxr.wtp.node_to_wikitext(template_node)
    expanded_node = wxr.wtp.parse(wikitext, expand_all=True)
    # Checked in this order; the first class name found in the span's class
    # attribute decides which Sound field receives the span text.
    span_fields = (("IPA", "ipa"), ("Latn", "roman"), ("Jpan", "form"))

    for list_item in expanded_node.find_child_recursively(NodeKind.LIST_ITEM):
        if list_item.contain_node(NodeKind.TABLE):
            continue

        sound = Sound()
        for span_tag in list_item.find_html_recursively("span"):
            span_classes = span_tag.attrs.get("class", "")
            if "qualifier-content" in span_classes:
                raw_tag = clean_node(wxr, None, span_tag)
                if raw_tag:
                    sound.raw_tags.append(raw_tag)
                continue
            for css_class, field_name in span_fields:
                if css_class in span_classes:
                    setattr(sound, field_name, clean_node(wxr, None, span_tag))
                    break

        for link_node in list_item.find_child(NodeKind.LINK):
            accent_tag = JA_PRON_ACCENTS.get(clean_node(wxr, None, link_node))
            if accent_tag is not None:
                sound.tags.append(accent_tag)

        # An all-default model dumps to an empty dict, i.e. nothing was found.
        if sound.model_dump(exclude_defaults=True):
            sounds.append(sound)

    template_args = template_node.template_parameters
    for arg in ("a", "audio"):
        audio_file = clean_node(wxr, None, template_args.get(arg, ""))
        if audio_file:
            audio_sound = Sound()
            set_sound_file_url_fields(wxr, audio_file, audio_sound)
            sounds.append(audio_sound)
# Values of the first positional argument of the ja-accent-common template,
# mapped to the tag appended by process_ja_accent_common_template.
JA_ACCENT_COMMON_TYPES: dict[str, str] = {
    "h": "Heiban",
    "a": "Atamadaka",
    "n": "Nakadaka",
    "o": "Odaka",
}
def process_ja_accent_common_template(
    wxr: WiktextractContext,
    template_node: TemplateNode,
    sounds: list[Sound],
) -> None:
    """Extract one sound from an expanded ``ja-accent-common`` template.

    The template is converted back to wikitext and re-parsed with full
    expansion. From the result:

    * the text of the first link with non-empty text becomes a raw tag;
    * the text of the first ``<span>`` with non-empty text becomes ``form``.

    The first positional template argument, if it is a key of
    ``JA_ACCENT_COMMON_TYPES``, adds the mapped tag. The sound is appended
    to ``sounds`` only when its ``form`` is not the empty string.
    """
    # https://ja.wiktionary.org/wiki/テンプレート:ja-accent-common
    wikitext = wxr.wtp.node_to_wikitext(template_node)
    expanded_node = wxr.wtp.parse(wikitext, expand_all=True)
    sound = Sound()

    # Both searches are lazy: nodes after the first match are never cleaned.
    link_texts = (
        clean_node(wxr, None, link_node)
        for link_node in expanded_node.find_child_recursively(NodeKind.LINK)
    )
    first_link_text = next((text for text in link_texts if text != ""), None)
    if first_link_text is not None:
        sound.raw_tags.append(first_link_text)

    span_texts = (
        clean_node(wxr, None, span_tag)
        for span_tag in expanded_node.find_html_recursively("span")
    )
    first_span_text = next((text for text in span_texts if text), None)
    if first_span_text is not None:
        sound.form = first_span_text

    accent_type = clean_node(
        wxr, None, template_node.template_parameters.get(1, "")
    )
    accent_tag = JA_ACCENT_COMMON_TYPES.get(accent_type)
    if accent_tag is not None:
        sound.tags.append(accent_tag)

    if sound.form != "":
        sounds.append(sound)
def extract_zh_sounds(
    wxr: WiktextractContext, level_node: LevelNode, sounds: list[Sound]
) -> None:
    """Build one ``Sound`` per list item found under ``level_node``.

    Each list item's children are split at the first ``":"`` that occurs in
    a plain-string child: everything before it forms the raw tag, everything
    after it forms ``zh_pron``. If no string child contains a colon, all
    children end up on the tag side. Every resulting sound is passed through
    ``translate_raw_tags`` and appended to ``sounds``.
    """
    for list_item in level_node.find_child_recursively(NodeKind.LIST_ITEM):
        seen_colon = False
        tag_nodes = []
        value_nodes = []
        for child in list_item.children:
            if seen_colon:
                value_nodes.append(child)
            elif isinstance(child, str) and ":" in child:
                # Only the first colon splits; later ones stay in the value.
                before, _, after = child.partition(":")
                tag_nodes.append(before)
                value_nodes.append(after)
                seen_colon = True
            else:
                tag_nodes.append(child)

        zh_pron = clean_node(wxr, None, value_nodes)
        raw_tag = clean_node(wxr, None, tag_nodes)
        sound = Sound(zh_pron=zh_pron, raw_tags=[raw_tag])
        translate_raw_tags(sound)
        sounds.append(sound)