Coverage for src/wiktextract/extractor/de/pronunciation.py: 55%
90 statements
« prev ^ index » next coverage.py v7.6.4, created at 2024-10-25 10:11 +0000
« prev ^ index » next coverage.py v7.6.4, created at 2024-10-25 10:11 +0000
1from typing import Union
3from mediawiki_langcodes import code_to_name
4from wikitextprocessor.parser import LevelNode, NodeKind, TemplateNode, WikiNode
6from ...page import clean_node
7from ...wxr_context import WiktextractContext
8from ..share import create_audio_url_dict
9from .models import Sound, WordEntry
def extract_pronunciation(
    wxr: WiktextractContext,
    word_entry: WordEntry,
    level_node: LevelNode,
):
    """Collect sound data from a pronunciation section into ``word_entry``.

    Every ``LIST`` child of ``level_node`` is processed on its own. Each of
    its list items is expected to consist of a head template followed by at
    least one more node; the head template name (``IPA``, ``Hörbeispiele`` or
    ``Reime``) selects the helper that consumes the remaining nodes. Nodes
    that do not fit this layout are reported through ``wxr.wtp.debug`` and
    skipped.

    Args:
        wxr: Extraction context; used for debug output and handed on to the
            helpers.
        word_entry: Entry whose ``sounds`` list is extended in place.
        level_node: Node of the pronunciation section.
    """
    for list_node in level_node.find_child(NodeKind.LIST):
        # Start with one empty entry so the helpers can always write to
        # ``sound_data[-1]``; entries that stay empty are filtered out below.
        sound_data: list[Sound] = [Sound()]

        for not_list_item_node in list_node.invert_find_child(
            NodeKind.LIST_ITEM
        ):
            wxr.wtp.debug(
                "Found unexpected non-list-item node in pronunciation "
                f"section: {not_list_item_node}",
                sortid="extractor/de/pronunciation/extract_pronunciation/28",
            )

        for list_item_node in list_node.find_child(NodeKind.LIST_ITEM):
            children = list(list_item_node.filter_empty_str_child())
            if not children:
                continue

            head_template, rest = children[0], children[1:]
            # A usable item is a template followed by at least one more node.
            if (
                not isinstance(head_template, WikiNode)
                or head_template.kind != NodeKind.TEMPLATE
                or not rest
            ):
                wxr.wtp.debug(
                    "Found unexpected non-template node in pronunciation "
                    f"section: {head_template}",
                    sortid="extractor/de/pronunciation/43",
                )
                continue

            template_name = head_template.template_name
            if template_name == "IPA":
                process_ipa(wxr, sound_data, rest)
            elif template_name == "Hörbeispiele":
                # Audio examples always go into a fresh entry.
                sound_data.append(Sound())
                process_hoerbeispiele(wxr, sound_data, rest)
            elif template_name == "Reime":
                process_rhymes(wxr, sound_data, rest, word_entry)
            else:
                wxr.wtp.debug(
                    "Unexpected template in pronunciation section: "
                    f"{head_template} with content {rest}",
                    # The original sortid carried a stray ")" at its end.
                    sortid="extractor/de/pronunciation/58",
                )

        # Keep only entries that hold at least one non-default value;
        # extending with an empty list is a no-op, so no length check needed.
        word_entry.sounds.extend(
            entry
            for entry in sound_data
            if entry.model_dump(exclude_defaults=True) != {}
        )

    for non_list_node in level_node.invert_find_child(NodeKind.LIST):
        wxr.wtp.debug(
            "Unexpected non-list node in pronunciation section: "
            f"{non_list_node}",
            sortid="extractor/de/pronunciation/extract_pronunciation/64",
        )
def process_ipa(
    wxr: WiktextractContext,
    sound_data: list[Sound],
    nodes: list[Union[WikiNode, str]],
):
    """Process the nodes that follow an ``IPA`` head template.

    Each node is handled by the first rule that matches:

    * a ``Lautschrift`` template is passed to
      ``process_lautschrift_template``;
    * any other tag node (see ``is_tag_node``) is added as a raw tag to the
      last entry of ``sound_data``;
    * a separator string (see ``is_new_sound_data_entry_sep``) starts a new
      ``Sound`` entry;
    * everything else is reported via ``wxr.wtp.debug``.
    """
    for child in nodes:
        if is_template_node_with_name(child, "Lautschrift"):
            process_lautschrift_template(wxr, sound_data, child)
            continue
        if is_tag_node(child):
            append_tag(wxr, sound_data[-1], child)
            continue
        if is_new_sound_data_entry_sep(child):
            sound_data.append(Sound())
            continue
        wxr.wtp.debug(
            f"Found unexpected non-Lautschrift node in IPA section: {child}",
            sortid="extractor/de/pronunciation/process_ipa/57",
        )
def process_lautschrift_template(
    wxr: WiktextractContext, sound_data: list[Sound], node: WikiNode
) -> None:
    """Store the data of a ``Lautschrift`` template in ``sound_data``.

    The first positional template parameter is stored under ``ipa`` (empty
    string when missing). When the ``spr`` parameter is set, it is stored as
    ``lang_code`` and the result of ``code_to_name(lang_code, "de")`` as
    ``lang``. The collected values are merged via
    ``add_sound_data_without_appending_to_existing_properties``.
    """
    params = node.template_parameters

    collected = {}
    lang_code = params.get("spr")
    if lang_code:
        collected["lang_code"] = lang_code
        collected["lang"] = code_to_name(lang_code, "de")
    collected["ipa"] = params.get(1, "")

    add_sound_data_without_appending_to_existing_properties(
        wxr, sound_data, collected
    )
def process_hoerbeispiele(
    wxr: WiktextractContext,
    sound_data: list[Sound],
    nodes: list[Union[str, WikiNode]],
):
    """Process the nodes that follow a ``Hörbeispiele`` head template.

    Each node is handled by the first rule that matches:

    * an ``Audio`` template is passed to ``process_audio_template``;
    * any other tag node (see ``is_tag_node``) is added as a raw tag to the
      last entry of ``sound_data``;
    * a separator string (see ``is_new_sound_data_entry_sep``) starts a new
      ``Sound`` entry;
    * everything else is reported via ``wxr.wtp.debug``.
    """
    for child in nodes:
        if is_template_node_with_name(child, "Audio"):
            process_audio_template(wxr, sound_data, child)
            continue
        if is_tag_node(child):
            append_tag(wxr, sound_data[-1], child)
            continue
        if is_new_sound_data_entry_sep(child):
            sound_data.append(Sound())
            continue
        wxr.wtp.debug(
            f"Found unexpected node in Hoerbeispiele section: {child}",
            sortid="extractor/de/pronunciation/process_hoerbeispiele/193",
        )
def process_audio_template(
    wxr: WiktextractContext, sound_data: list[Sound], node: WikiNode
):
    """Store the audio file named by an ``Audio`` template in ``sound_data``.

    The file name is the stripped first positional template parameter. When
    it is empty nothing happens; otherwise the dict returned by
    ``create_audio_url_dict`` is merged via
    ``add_sound_data_without_appending_to_existing_properties``.
    """
    audio_file = node.template_parameters.get(1, "").strip()
    if not audio_file:
        return
    audio_data = create_audio_url_dict(audio_file)
    add_sound_data_without_appending_to_existing_properties(
        wxr, sound_data, audio_data
    )
def process_rhymes(
    wxr: WiktextractContext,
    sound_data: list[Sound],
    nodes: list[WikiNode],
    word_entry: WordEntry,
):
    """Append one ``Sound`` with ``rhymes`` set per non-empty ``Reim`` template.

    Nodes that are not ``Reim`` templates are ignored without a debug
    message. The rhyme text is whatever ``clean_node`` returns for the
    template.
    """
    for candidate in nodes:
        is_rhyme_template = (
            isinstance(candidate, TemplateNode)
            and candidate.template_name == "Reim"
        )
        if not is_rhyme_template:
            continue
        # https://de.wiktionary.org/wiki/Vorlage:Reime
        rhyme = clean_node(wxr, word_entry, candidate)
        if rhyme:
            sound_data.append(Sound(rhymes=rhyme))
def is_template_node_with_name(node: Union[WikiNode, str], template_name: str):
    """Return whether ``node`` is a template node named ``template_name``."""
    if not isinstance(node, WikiNode):
        return False
    if node.kind != NodeKind.TEMPLATE:
        return False
    return node.template_name == template_name
def add_sound_data_without_appending_to_existing_properties(
    wxr: WiktextractContext,
    sound_data: list[Sound],
    new_sound_data: dict,
):
    """Merge ``new_sound_data`` into the last entry of ``sound_data``.

    A new ``Sound`` is appended first when the last entry already holds a
    non-default value for any key of ``new_sound_data``, so existing values
    are never overwritten or extended. String values are assigned to the
    attribute of the same name; any other value is passed to ``extend`` on
    the existing attribute. Keys that are not fields of the entry are
    reported via ``wxr.wtp.debug`` and skipped.

    Args:
        wxr: Extraction context, used for debug output.
        sound_data: Entries collected so far; modified in place.
        new_sound_data: Mapping of field name to the value to store.
    """
    if not new_sound_data:
        # Nothing to merge; also keeps an empty ``sound_data`` untouched.
        return
    if not sound_data:
        # Guard against an empty list instead of failing on ``[-1]``.
        sound_data.append(Sound())

    # Dump the last entry once rather than once per key.
    already_set = sound_data[-1].model_dump(exclude_defaults=True)
    if any(key in already_set for key in new_sound_data):
        sound_data.append(Sound())

    target = sound_data[-1]
    # Read the field table from the class: accessing ``model_fields`` on an
    # instance is deprecated in Pydantic 2.11.
    known_fields = type(target).model_fields
    for key, value in new_sound_data.items():
        if key not in known_fields:
            wxr.wtp.debug(
                f"Unexpected key {key} for Sound",
                sortid="extractor/de/pronunciation/196",
            )
            continue
        if isinstance(value, str):
            setattr(target, key, value)
        else:
            getattr(target, key).extend(value)
def is_tag_node(node: Union[WikiNode, str]):
    """Return whether ``node`` is a ``WikiNode`` of kind TEMPLATE or ITALIC."""
    if not isinstance(node, WikiNode):
        return False
    return node.kind in (NodeKind.TEMPLATE, NodeKind.ITALIC)
def append_tag(wxr: WiktextractContext, sound_data: Sound, node: WikiNode):
    """Append the cleaned text of ``node`` to ``sound_data.raw_tags``.

    Nothing is appended when ``clean_node`` yields an empty string.
    """
    tag = clean_node(wxr, None, node)
    if tag == "":
        return
    sound_data.raw_tags.append(tag)
def is_new_sound_data_entry_sep(node: Union[WikiNode, str]):
    """Return whether ``node`` is a string that is just ``,`` or ``;``.

    Surrounding whitespace is ignored; non-string nodes never match.
    """
    if not isinstance(node, str):
        return False
    return node.strip() in (",", ";")