Coverage for src / wiktextract / extractor / pl / sound.py: 75%

78 statements  

« prev     ^ index     » next       coverage.py v7.13.0, created at 2025-12-12 08:09 +0000

1import re 

2from functools import partial 

3from itertools import chain, count 

4 

5from wikitextprocessor import LevelNode, NodeKind, TemplateNode, WikiNode 

6 

7from ...page import clean_node 

8from ...wxr_context import WiktextractContext 

9from ..share import set_sound_file_url_fields 

10from .models import Hyphenation, Sound, WordEntry 

11from .tags import translate_raw_tags 

12 

# Template names that `process_sound_template` does not turn into a sound
# entry of their own; their cleaned expansion is collected as a raw tag.
SOUND_TAG_TEMPLATES = frozenset({"RP", "amer", "lp", "lm"})

14 

15 

def extract_sound_section(
    wxr: WiktextractContext,
    base_data: WordEntry,
    level_node: WikiNode,
) -> None:
    """Collect pronunciation data found under a sound section.

    Every list item below ``level_node`` (searched recursively) is scanned in
    order.  Template children are passed to ``process_sound_template``
    together with the raw tags gathered so far in that list item.  A
    plain-text child containing a parenthesised run of digits, whitespace,
    commas, hyphens and dots updates the sense index that is attached to the
    templates processed afterwards; the sense index is not reset between
    list items.

    When the section contains no list item at all, every template found
    recursively is processed instead, each with an empty raw-tag list.

    Args:
        wxr: Extraction context forwarded to the template handlers.
        base_data: Entry that receives the extracted data.
        level_node: Node of the section to scan.
    """
    # The hyphen is escaped so that it is matched literally; written as
    # `,-.` it forms a character range that covers `,`, `-` and `.` only by
    # virtue of their ASCII ordering.
    sense_index_re = re.compile(r"\(([\d\s,.\-]+)\)")

    has_list = False
    sense_index = ""
    for list_item in level_node.find_child_recursively(NodeKind.LIST_ITEM):
        has_list = True
        # Raw tags are scoped to a single list item.
        raw_tags = []
        for node in list_item.children:
            if isinstance(node, TemplateNode):
                process_sound_template(
                    wxr, base_data, node, raw_tags, sense_index
                )
            elif isinstance(node, str):
                m = sense_index_re.search(node)
                if m is not None:
                    sense_index = m.group(1)

    if not has_list:
        # could have preformatted node, can't use `find_child()`
        for template_node in level_node.find_child_recursively(
            NodeKind.TEMPLATE
        ):
            process_sound_template(
                wxr, base_data, template_node, [], sense_index
            )

44 

45 

def process_sound_template(
    wxr: WiktextractContext,
    base_data: WordEntry,
    template_node: TemplateNode,
    raw_tags: list[str],
    sense_index: str,
) -> None:
    """Handle one template of a sound section, dispatching on its name.

    * Names starting with ``IPA``, ``AS`` or ``SAMPA``: the cleaned first
      positional argument becomes the ``ipa`` value of a new sound.
    * Names starting with ``audio``: the first positional argument is used
      as the audio file name of a new sound; ``raw_tags`` is emptied in
      place afterwards.
    * Names in ``SOUND_TAG_TEMPLATES``: the cleaned template text is added
      to ``raw_tags``.
    * ``pinyin`` / ``zhuyin``: the first positional argument becomes the
      ``zh_pron`` value of a new sound.
    * ``dzielenie`` / ``homofony``: delegated to their own extractors.

    Any other template is ignored, as is a template whose first argument is
    empty or not a string.

    Args:
        wxr: Extraction context.
        base_data: Entry whose ``sounds`` list receives new items.
        template_node: Template to process.
        raw_tags: Tags collected so far by the caller; mutated in place.
        sense_index: Sense index stored on every sound created here.
    """
    name = template_node.template_name

    if name.startswith(("IPA", "AS", "SAMPA")):
        ipa = clean_node(
            wxr, None, template_node.template_parameters.get(1, "")
        )
        if not isinstance(ipa, str) or len(ipa) == 0:
            return
        sound = Sound(ipa=ipa, raw_tags=raw_tags, sense_index=sense_index)
        if name.startswith("AS"):
            sound.tags.append("Slavic-alphabet")
        elif name == "SAMPA":
            sound.tags.append("SAMPA")
        translate_raw_tags(sound)
        base_data.sounds.append(sound)
        return

    if name.startswith("audio"):
        audio_file = template_node.template_parameters.get(1, "")
        if not isinstance(audio_file, str) or len(audio_file) == 0:
            return
        sound = Sound(raw_tags=raw_tags, sense_index=sense_index)
        set_sound_file_url_fields(wxr, audio_file, sound)
        translate_raw_tags(sound)
        base_data.sounds.append(sound)
        # The collected tags have been used by this audio entry; start over
        # for whatever follows in the caller's list.
        raw_tags.clear()
        return

    if name in SOUND_TAG_TEMPLATES:
        raw_tags.append(clean_node(wxr, None, template_node))
        return

    if name in ("pinyin", "zhuyin"):
        zh_pron = template_node.template_parameters.get(1, "")
        if not isinstance(zh_pron, str) or len(zh_pron) == 0:
            return
        sound = Sound(
            zh_pron=zh_pron, raw_tags=raw_tags, sense_index=sense_index
        )
        sound.tags.append("Pinyin" if name == "pinyin" else "Bopomofo")
        translate_raw_tags(sound)
        base_data.sounds.append(sound)
        return

    if name == "dzielenie":
        extract_dzielenie_template(wxr, base_data, template_node)
    elif name == "homofony":
        extract_homofony_template(wxr, base_data, template_node, sense_index)

91 

92 

def extract_morphology_section(
    wxr: WiktextractContext, base_data: WordEntry, level_node: LevelNode
) -> None:
    """Add a hyphenation entry for each non-empty ``morfeo`` template.

    The cleaned template text is split on whitespace and each resulting
    chunk is split again on "•"; the flattened pieces become the ``parts``
    of one ``Hyphenation`` appended to ``base_data.hyphenations``.
    """
    # Templates are searched recursively because they can sit inside a
    # "preformatted" node.
    for t_node in level_node.find_child_recursively(NodeKind.TEMPLATE):
        if t_node.template_name != "morfeo":
            continue
        h_str = clean_node(wxr, base_data, t_node)
        if h_str == "":
            continue
        parts = [
            piece for chunk in h_str.split() for piece in chunk.split("•")
        ]
        base_data.hyphenations.append(Hyphenation(parts=parts))

110 

111 

def extract_dzielenie_template(
    wxr: WiktextractContext, base_data: WordEntry, t_node: TemplateNode
) -> None:
    """Add a hyphenation entry taken from a ``dzielenie`` template.

    The template is cleaned to text and everything up to and including the
    first ":" is dropped (the whole text is kept when there is no colon).
    The remainder is split on whitespace and then on "•"; the flattened
    pieces become the ``parts`` of a ``Hyphenation`` appended to
    ``base_data.hyphenations``.

    Nothing is appended when no text remains after the colon, so that an
    empty template does not produce a hyphenation entry without parts.
    """
    expanded_str = clean_node(wxr, base_data, t_node)
    # `find` returns -1 when there is no colon, so the slice starts at 0.
    h_str = expanded_str[expanded_str.find(":") + 1 :].strip()
    if h_str == "":
        return
    parts = [piece for chunk in h_str.split() for piece in chunk.split("•")]
    base_data.hyphenations.append(Hyphenation(parts=parts))

126 

127 

def extract_homofony_template(
    wxr: WiktextractContext,
    base_data: WordEntry,
    t_node: TemplateNode,
    sense_index: str,
):
    """Add one homophone sound per positional argument of the template.

    Positional arguments are read starting at 1 and reading stops at the
    first missing index.  Each argument is cleaned to text; non-empty
    results are appended to ``base_data.sounds`` as ``Sound`` items carrying
    ``sense_index``.
    """
    params = t_node.template_parameters
    index = 1
    while index in params:
        word = clean_node(wxr, None, params[index])
        if word != "":
            base_data.sounds.append(
                Sound(homophone=word, sense_index=sense_index)
            )
        index += 1