Coverage for src / wiktextract / extractor / pl / sound.py: 75%
78 statements
« prev ^ index » next coverage.py v7.13.0, created at 2025-12-12 08:09 +0000
« prev ^ index » next coverage.py v7.13.0, created at 2025-12-12 08:09 +0000
1import re
2from functools import partial
3from itertools import chain, count
5from wikitextprocessor import LevelNode, NodeKind, TemplateNode, WikiNode
7from ...page import clean_node
8from ...wxr_context import WiktextractContext
9from ..share import set_sound_file_url_fields
10from .models import Hyphenation, Sound, WordEntry
11from .tags import translate_raw_tags
# Names of templates that `process_sound_template()` does not turn into a
# sound entry of their own: their cleaned expansion is appended to the pending
# raw tags instead.
SOUND_TAG_TEMPLATES = frozenset({"RP", "amer", "lp", "lm"})
def extract_sound_section(
    wxr: WiktextractContext,
    base_data: WordEntry,
    level_node: WikiNode,
) -> None:
    """Walk a sound section and feed its templates to the sound extractor.

    Every list item found anywhere below `level_node` is scanned child by
    child: template children go to `process_sound_template()`, and plain
    string children are searched for a parenthesised sense index such as
    "(1.1)".  The most recent sense index found is passed along with each
    following template; it is not reset between list items, whereas the raw
    tag list is recreated for every list item.

    If the section contains no list item at all, every template found below
    `level_node` is processed instead, with an empty raw tag list.
    """
    current_sense = ""
    found_list_item = False
    for item in level_node.find_child_recursively(NodeKind.LIST_ITEM):
        found_list_item = True
        item_tags = []
        for child in item.children:
            if isinstance(child, str):
                # Inside the character class, `,-.` is a range that spans
                # exactly the three characters ",", "-" and ".".
                index_match = re.search(r"\(([\d\s,-.]+)\)", child)
                if index_match is not None:
                    current_sense = index_match.group(1)
                continue
            if isinstance(child, TemplateNode):
                process_sound_template(
                    wxr, base_data, child, item_tags, current_sense
                )

    if found_list_item:
        return

    # could have preformatted node, can't use `find_child()`
    for t_node in level_node.find_child_recursively(NodeKind.TEMPLATE):
        process_sound_template(wxr, base_data, t_node, [], current_sense)
def process_sound_template(
    wxr: WiktextractContext,
    base_data: WordEntry,
    template_node: TemplateNode,
    raw_tags: list[str],
    sense_index: str,
) -> None:
    """Convert one sound-section template into data stored on `base_data`.

    The template name decides what happens; the checks run in this order:

    * name starts with "IPA", "AS" or "SAMPA": the cleaned first argument, if
      not empty, becomes the `ipa` of a new `Sound`.  Names starting with
      "AS" add the "Slavic-alphabet" tag, the exact name "SAMPA" adds the
      "SAMPA" tag.
    * name starts with "audio": a first argument that is a non-empty string
      is handed to `set_sound_file_url_fields()` together with a new `Sound`;
      afterwards `raw_tags` is emptied in place.
    * name is in `SOUND_TAG_TEMPLATES`: the cleaned template expansion is
      appended to `raw_tags` (the caller's list is mutated).
    * "pinyin" / "zhuyin": a first argument that is a non-empty string becomes
      `zh_pron` of a new `Sound` tagged "Pinyin" or "Bopomofo".
    * "dzielenie": delegated to `extract_dzielenie_template()`.
    * "homofony": delegated to `extract_homofony_template()`.

    Any other template is ignored.  New `Sound` objects receive `raw_tags`
    and `sense_index`, go through `translate_raw_tags()` and are appended to
    `base_data.sounds`.
    """
    name = template_node.template_name

    if name.startswith(("IPA", "AS", "SAMPA")):
        ipa = clean_node(
            wxr, None, template_node.template_parameters.get(1, "")
        )
        if isinstance(ipa, str) and ipa != "":
            sound = Sound(ipa=ipa, raw_tags=raw_tags, sense_index=sense_index)
            if name.startswith("AS"):
                sound.tags.append("Slavic-alphabet")
            elif name == "SAMPA":
                sound.tags.append("SAMPA")
            translate_raw_tags(sound)
            base_data.sounds.append(sound)
        return

    if name.startswith("audio"):
        audio_file = template_node.template_parameters.get(1, "")
        if isinstance(audio_file, str) and audio_file != "":
            sound = Sound(raw_tags=raw_tags, sense_index=sense_index)
            set_sound_file_url_fields(wxr, audio_file, sound)
            translate_raw_tags(sound)
            base_data.sounds.append(sound)
            # Only cleared once an audio entry has actually been stored.
            raw_tags.clear()
        return

    if name in SOUND_TAG_TEMPLATES:
        raw_tags.append(clean_node(wxr, None, template_node))
        return

    if name in ("pinyin", "zhuyin"):
        zh_pron = template_node.template_parameters.get(1, "")
        if isinstance(zh_pron, str) and zh_pron != "":
            sound = Sound(
                zh_pron=zh_pron, raw_tags=raw_tags, sense_index=sense_index
            )
            sound.tags.append("Pinyin" if name == "pinyin" else "Bopomofo")
            translate_raw_tags(sound)
            base_data.sounds.append(sound)
        return

    if name == "dzielenie":
        extract_dzielenie_template(wxr, base_data, template_node)
        return

    if name == "homofony":
        extract_homofony_template(wxr, base_data, template_node, sense_index)
def extract_morphology_section(
    wxr: WiktextractContext, base_data: WordEntry, level_node: LevelNode
) -> None:
    """Store the output of "morfeo" templates as hyphenation data.

    Each "morfeo" template found anywhere below `level_node` is expanded with
    `clean_node()`.  A non-empty expansion is split on whitespace, each
    resulting word is split again on "•", and all pieces together form the
    `parts` of one `Hyphenation` appended to `base_data.hyphenations`.
    """
    # Searched recursively: the templates may sit in a "preformatted" node.
    for template in level_node.find_child_recursively(NodeKind.TEMPLATE):
        if template.template_name != "morfeo":
            continue
        expanded = clean_node(wxr, base_data, template)
        if expanded == "":
            continue
        parts = []
        for word in expanded.split():
            parts.extend(word.split("•"))
        base_data.hyphenations.append(Hyphenation(parts=parts))
def extract_dzielenie_template(
    wxr: WiktextractContext, base_data: WordEntry, t_node: TemplateNode
) -> None:
    """Store the output of a "dzielenie" template as hyphenation data.

    The template is expanded with `clean_node()`; everything up to and
    including the first ":" is dropped and the rest is stripped.  That text
    is split on whitespace, each word is split again on "•", and all pieces
    together form the `parts` of one `Hyphenation` appended to
    `base_data.hyphenations`.

    Nothing is appended when no text is left after stripping, matching the
    empty-string guard in `extract_morphology_section()`; otherwise an entry
    with an empty `parts` list would be stored.
    """
    expanded_str = clean_node(wxr, base_data, t_node)
    # `find()` returns -1 when there is no ":", so the slice then starts at 0
    # and keeps the whole expansion.
    h_str = expanded_str[expanded_str.find(":") + 1 :].strip()
    if h_str == "":
        return
    base_data.hyphenations.append(
        Hyphenation(
            parts=[
                part for word in h_str.split() for part in word.split("•")
            ]
        )
    )
def extract_homofony_template(
    wxr: WiktextractContext,
    base_data: WordEntry,
    t_node: TemplateNode,
    sense_index: str,
):
    """Add one homophone `Sound` per positional argument of the template.

    Arguments are read at keys 1, 2, 3, ... and reading stops at the first
    key that is missing from the template parameters.  Each argument is
    cleaned with `clean_node()`; a non-empty result is appended to
    `base_data.sounds` as `Sound(homophone=..., sense_index=sense_index)`.
    """
    position = 1
    while position in t_node.template_parameters:
        homophone = clean_node(
            wxr, None, t_node.template_parameters[position]
        )
        if homophone != "":
            base_data.sounds.append(
                Sound(homophone=homophone, sense_index=sense_index)
            )
        position += 1