Coverage for src/wiktextract/extractor/es/pronunciation.py: 90%

86 statements  

« prev     ^ index     » next       coverage.py v7.6.4, created at 2024-10-25 10:11 +0000

1from wikitextprocessor import HTMLNode, NodeKind, TemplateNode, WikiNode 

2 

3from ...page import clean_node 

4from ...wxr_context import WiktextractContext 

5from ..share import create_audio_url_dict 

6from .models import Sound, WordEntry 

7 

# Maps a row header of the expanded "pron-graf" table to the name of the
# sound model field that stores that row's value.
PRON_GRAF_HEADER_MAP: dict[str, str] = dict.fromkeys(
    ("rimas", "rima"), "rhymes"
)

13 

14 

def process_pron_graf_template(
    wxr: WiktextractContext, word_entry: WordEntry, template_node: TemplateNode
) -> None:
    """Extract pronunciation data from an expanded "pron-graf" template.

    Template page: https://es.wiktionary.org/wiki/Plantilla:pron-graf

    The template can produce sound data even when it is called without any
    parameter, so it is expanded first and the resulting two-column table
    is read row by row: the first cell is the row header, the second cell
    holds the value.

    Each row is stored on `word_entry` according to its header text. Rows
    whose header is not recognised are kept, untranslated, in
    `word_entry.extra_sounds`. Nothing is done when the expansion contains
    no table.
    """
    # Headers matched by exact text whose cell is handled by one of the
    # cell helpers, mapped to the sound field the helper fills in.
    link_cell_fields = {"variantes": "alternative", "homófonos": "homophone"}
    text_cell_fields = {
        "transliteraciones": "roman",
        "transcripciones silábicas": "syllabic",
    }

    wikitext = wxr.wtp.node_to_wikitext(template_node)
    root = wxr.wtp.parse(wikitext, expand_all=True)
    # Only the first table of the expansion is used.
    table = next(iter(root.find_child(NodeKind.TABLE)), None)
    if table is None:
        return

    untranslated = {}  # header text -> value text, not translated
    for row in table.find_child(NodeKind.TABLE_ROW):
        cells = list(row.find_child(NodeKind.TABLE_CELL))
        if len(cells) != 2:
            continue
        header_cell, value_cell = cells
        header = clean_node(wxr, None, header_cell)
        value = clean_node(wxr, None, value_cell)

        if header.endswith(" (AFI)"):  # IPA
            process_pron_graf_ipa_cell(wxr, word_entry, value_cell, header)
        elif header == "silabación":
            word_entry.hyphenation = value
        elif header in PRON_GRAF_HEADER_MAP:
            mapped_sound = Sound()
            setattr(mapped_sound, PRON_GRAF_HEADER_MAP[header], value)
            word_entry.sounds.append(mapped_sound)
        elif header.endswith(" alternativas"):
            process_pron_graf_link_cell(
                wxr, word_entry, value_cell, header, "alternative"
            )
        elif header in link_cell_fields:
            process_pron_graf_link_cell(
                wxr, word_entry, value_cell, header, link_cell_fields[header]
            )
        elif header in text_cell_fields:
            process_pron_graf_text_cell(
                wxr, word_entry, value_cell, text_cell_fields[header]
            )
        else:
            untranslated[header] = value

    if untranslated:
        word_entry.extra_sounds = untranslated
    # NOTE(review): this call passes `word_entry` instead of None,
    # presumably so clean_node can record page-level data from the whole
    # expansion on the entry - confirm against clean_node.
    clean_node(wxr, word_entry, root)

64 

65 

def process_pron_graf_ipa_cell(
    wxr: WiktextractContext,
    word_entry: WordEntry,
    cell_node: WikiNode,
    header_text: str,
) -> None:
    """Collect IPA text and audio data from the value cell of an "(AFI)" row.

    The children of `cell_node` are walked in order and accumulated into
    one `Sound`:

    - non-blank text is appended to `ipa`;
    - a "phonos" HTML tag supplies the audio file (its URL fields come from
      `create_audio_url_dict`) and the text of its "small" tags as raw
      tags;
    - a "br" HTML tag ends the current sound, provided it holds any data.

    Every finished sound is appended to `word_entry.sounds`. Unless
    `header_text` starts with "pronunciación", the header text without its
    " (AFI)" suffix is added to the sound's raw tags (it is treated as a
    location).
    """
    header_is_generic = header_text.startswith("pronunciación")
    location_tag = header_text.removesuffix(" (AFI)")

    current = Sound()
    for child in cell_node.children:
        if isinstance(child, str):
            ipa_text = child.strip()
            if len(ipa_text) > 0:
                current.ipa += ipa_text
            continue
        if not isinstance(child, HTMLNode):
            continue

        if child.tag == "phonos":
            audio_file = child.attrs.get("file", "")
            for field, url in create_audio_url_dict(audio_file).items():
                # only copy keys that exist as fields of the sound model
                if hasattr(current, field):
                    setattr(current, field, url)
            for small in child.find_html("small"):
                current.raw_tags.append(clean_node(wxr, None, small))
        elif child.tag == "br" and current != Sound():
            # line break: the sound collected so far is complete
            if not header_is_generic:
                current.raw_tags.append(location_tag)
            word_entry.sounds.append(current.model_copy(deep=True))
            current = Sound()

    # the last sound in the cell is not followed by a line break
    if current != Sound():
        if not header_is_generic:
            current.raw_tags.append(location_tag)
        word_entry.sounds.append(current)

96 

97 

def process_pron_graf_link_cell(
    wxr: WiktextractContext,
    word_entry: WordEntry,
    cell_node: WikiNode,
    header_text: str,
    field_name: str,
) -> None:
    """Create one `Sound` per link found directly inside `cell_node`.

    The cleaned link text is stored in the sound attribute named by
    `field_name`. When the node right after the link is a "ref" HTML tag,
    its cleaned content becomes the sound's `note`. For the "variantes"
    header the sound is also flagged with `not_same_pronunciation`.
    Each sound is appended to `word_entry.sounds`.
    """
    siblings = cell_node.children
    is_variant_row = header_text == "variantes"

    for position, link in cell_node.find_child(NodeKind.LINK, with_index=True):
        entry = Sound()
        setattr(entry, field_name, clean_node(wxr, None, link))

        next_position = position + 1
        follower = (
            siblings[next_position] if next_position < len(siblings) else None
        )
        if isinstance(follower, HTMLNode) and follower.tag == "ref":
            # a "ref" tag directly after the link carries the note text
            entry.note = clean_node(wxr, None, follower.children)

        if is_variant_row:
            entry.not_same_pronunciation = True
        word_entry.sounds.append(entry)

122 

123 

def process_pron_graf_text_cell(
    wxr: WiktextractContext,
    word_entry: WordEntry,
    cell_node: WikiNode,
    field_name: str,
) -> None:
    """Create `Sound` entries from the plain-text values of `cell_node`.

    Each non-blank text child of the cell is stored in the sound attribute
    named by `field_name`. A text child starting with ",&nbsp;" marks the
    beginning of a new value: the sound collected so far is emitted and a
    fresh one is started. The cleaned content of a "ref" HTML tag becomes
    the `note` of the sound being collected.

    Sounds are appended to `word_entry.sounds` only when their
    `field_name` attribute is non-empty, so a separator that appears before
    any value does not produce an empty entry.
    """
    sound = Sound()
    for node in cell_node.children:
        if isinstance(node, str):
            text = node.strip()
            if len(text) == 0:
                continue
            if text.startswith(",&nbsp;"):
                text = text.removeprefix(",&nbsp;")
                # Emit the previous value before starting the next one.
                # No copy is needed because `sound` is rebound right away.
                if len(getattr(sound, field_name)) > 0:
                    word_entry.sounds.append(sound)
                sound = Sound()
            setattr(sound, field_name, text.strip())
        elif isinstance(node, HTMLNode) and node.tag == "ref":
            sound.note = clean_node(wxr, None, node.children)
    if len(getattr(sound, field_name)) > 0:
        word_entry.sounds.append(sound)