Coverage for src / wiktextract / extractor / es / pronunciation.py: 88%

84 statements  

« prev     ^ index     » next       coverage.py v7.13.0, created at 2025-12-12 08:09 +0000

1from wikitextprocessor import HTMLNode, NodeKind, TemplateNode, WikiNode 

2 

3from ...page import clean_node 

4from ...wxr_context import WiktextractContext 

5from ..share import set_sound_file_url_fields 

6from .models import Hyphenation, Sound, WordEntry 

7from .tags import translate_raw_tags 

8 

# Maps a "pron-graf" table row header (as displayed on the page) to the name
# of the Sound model field that stores that row's value.
PRON_GRAF_HEADER_MAP = dict.fromkeys(("rimas", "rima"), "rhymes")

14 

15 

def process_pron_graf_template(
    wxr: WiktextractContext, word_entry: WordEntry, template_node: TemplateNode
) -> None:
    """Extract sound data from a "pron-graf" template into ``word_entry``.

    The template is expanded and re-parsed; the first table in the result is
    read row by row.  Only rows with exactly two cells (header, value) are
    used, and the header text decides how the value cell is interpreted.
    Results are appended to ``word_entry.sounds`` or
    ``word_entry.hyphenations``.  Nothing is done when the expansion contains
    no table.
    """
    # https://es.wiktionary.org/wiki/Plantilla:pron-graf
    # The template can produce sound data even when called with no
    # parameters, so it has to be expanded; it renders a two-column table.
    template_wikitext = wxr.wtp.node_to_wikitext(template_node)
    expanded_node = wxr.wtp.parse(template_wikitext, expand_all=True)
    first_table = next(iter(expanded_node.find_child(NodeKind.TABLE)), None)
    if first_table is None:
        return

    for row in first_table.find_child(NodeKind.TABLE_ROW):
        cells = list(row.find_child(NodeKind.TABLE_CELL))
        if len(cells) != 2:
            continue
        header_cell, value_cell = cells
        header = clean_node(wxr, None, header_cell)
        value = clean_node(wxr, None, value_cell)

        # The checks below are ordered; the first matching one wins.
        if header.endswith(" (AFI)"):  # IPA
            process_pron_graf_ipa_cell(wxr, word_entry, value_cell, header)
            continue
        if header == "silabación" and value != "":
            word_entry.hyphenations.append(Hyphenation(parts=value.split("-")))
            continue
        if header in PRON_GRAF_HEADER_MAP:
            mapped_sound = Sound()
            setattr(mapped_sound, PRON_GRAF_HEADER_MAP[header], value)
            word_entry.sounds.append(mapped_sound)
            continue
        if header == "variantes" or header.endswith(" alternativas"):
            process_pron_graf_link_cell(
                wxr, word_entry, value_cell, header, "alternative"
            )
            continue
        if header == "homófonos":
            process_pron_graf_link_cell(
                wxr, word_entry, value_cell, header, "homophone"
            )
            continue
        if header == "transliteraciones":
            process_pron_graf_text_cell(wxr, word_entry, value_cell, "roman")
            continue
        if header == "transcripciones silábicas":
            process_pron_graf_text_cell(wxr, word_entry, value_cell, "syllabic")
            continue
        # Unknown row: keep the value verbatim and use the header as a raw
        # tag, skipping empty cells and the "falta agregar" placeholder.
        if header != "" and value not in ["", "falta agregar"]:
            word_entry.sounds.append(Sound(other=value, raw_tags=[header]))

    # Second pass over the whole expansion, this time handing word_entry to
    # clean_node (presumably so it can record page-level data such as
    # categories on the entry -- confirm against clean_node).
    clean_node(wxr, word_entry, expanded_node)

66 

67 

def process_pron_graf_ipa_cell(
    wxr: WiktextractContext,
    word_entry: WordEntry,
    cell_node: WikiNode,
    header_text: str,
) -> None:
    """Collect IPA pronunciations from the value cell of an "(AFI)" row.

    The cell's children are read in order: non-blank text is accumulated as
    IPA, a ``<phonos>`` element contributes an audio file plus the text of
    its ``<small>`` children as raw tags, and a ``<br>`` element ends the
    current pronunciation.  Each non-empty pronunciation is appended to
    ``word_entry.sounds``.
    """
    # Rows whose header does not start with "pronunciación" carry a label in
    # the header; it becomes a raw tag on every sound found in the cell.
    if header_text.startswith("pronunciación"):
        header_tag = None
    else:
        header_tag = header_text.removesuffix(" (AFI)")

    def finalize(finished: Sound) -> None:
        # Attach the header label (if any) and translate the raw tags.
        if header_tag is not None:
            finished.raw_tags.append(header_tag)
        translate_raw_tags(finished)

    current = Sound()
    for child in cell_node.children:
        if isinstance(child, str):
            text = child.strip()
            if text:
                current.ipa += text
            continue
        if not isinstance(child, HTMLNode):
            continue
        if child.tag == "phonos":
            set_sound_file_url_fields(wxr, child.attrs.get("file", ""), current)
            for small in child.find_html("small"):
                current.raw_tags.append(clean_node(wxr, None, small))
        elif child.tag == "br" and current != Sound():
            # A line break separates pronunciations; skip it when nothing
            # has been collected yet.
            finalize(current)
            word_entry.sounds.append(current.model_copy(deep=True))
            current = Sound()

    # Flush whatever is left after the last child.
    if current != Sound():
        finalize(current)
        word_entry.sounds.append(current)

97 

98 

def process_pron_graf_link_cell(
    wxr: WiktextractContext,
    word_entry: WordEntry,
    cell_node: WikiNode,
    header_text: str,
    field_name: str,
) -> None:
    """Create one sound entry per link found directly inside ``cell_node``.

    The cleaned link text is stored in the ``Sound`` attribute named by
    ``field_name``.  When the child immediately after the link is a ``<ref>``
    element, its cleaned content becomes the sound's note.  Rows headed
    "variantes" are additionally flagged with ``not_same_pronunciation``.
    """
    siblings = cell_node.children
    for position, link in cell_node.find_child(NodeKind.LINK, with_index=True):
        entry = Sound()
        setattr(entry, field_name, clean_node(wxr, None, link))

        # Look at the node right after the link, if there is one.
        following = (
            siblings[position + 1] if position + 1 < len(siblings) else None
        )
        if isinstance(following, HTMLNode) and following.tag == "ref":
            # The content of the adjacent "ref" tag is the note text.
            entry.note = clean_node(wxr, None, following.children)

        if header_text == "variantes":
            entry.not_same_pronunciation = True
        word_entry.sounds.append(entry)

123 

124 

def process_pron_graf_text_cell(
    wxr: WiktextractContext,
    word_entry: WordEntry,
    cell_node: WikiNode,
    field_name: str,
) -> None:
    """Collect comma-separated plain-text values from a table value cell.

    Each non-blank text child of ``cell_node`` is stored in the ``Sound``
    attribute named by ``field_name``.  A text child that starts with
    ",&nbsp;" begins a new value, so the sound built so far is pushed to
    ``word_entry.sounds`` first.  A ``<ref>`` element supplies the note of
    the sound currently being built.

    A sound is only appended when its ``field_name`` attribute is non-empty.
    This applies both at a separator and at the end of the cell, so a cell
    that starts with the separator (or contains two in a row) does not
    produce an empty entry.
    """
    separator = ",&nbsp;"
    sound = Sound()
    for node in cell_node.children:
        if isinstance(node, str):
            text = node.strip()
            if len(text) == 0:
                continue
            if text.startswith(separator):
                text = text.removeprefix(separator)
                # Same emptiness guard as the end-of-cell flush below.
                if len(getattr(sound, field_name)) > 0:
                    word_entry.sounds.append(sound)
                sound = Sound()
            setattr(sound, field_name, text.strip())
        elif isinstance(node, HTMLNode) and node.tag == "ref":
            sound.note = clean_node(wxr, None, node.children)
    if len(getattr(sound, field_name)) > 0:
        word_entry.sounds.append(sound)