Coverage for src/wiktextract/extractor/es/page.py: 78%

111 statements  

« prev     ^ index     » next       coverage.py v7.9.2, created at 2025-07-09 23:59 +0000

from typing import Any

from wikitextprocessor.parser import (
    LEVEL_KIND_FLAGS,
    NodeKind,
    TemplateNode,
    WikiNode,
)

from ...page import clean_node
from ...wxr_context import WiktextractContext
from ...wxr_logging import logger
from .conjugation import extract_conjugation_section
from .etymology import extract_etymology_section
from .linkage import (
    extract_additional_information_section,
    extract_alt_form_section,
    extract_linkage_section,
)
from .models import Sense, WordEntry
from .pos import extract_pos_section
from .pronunciation import process_pron_graf_template
from .section_titles import (
    IGNORED_TITLES,
    LINKAGE_TITLES,
    POS_TITLES,
    TRANSLATIONS_TITLES,
)
from .translation import extract_translation_section

28 

29 

def parse_section(
    wxr: WiktextractContext,
    page_data: list[WordEntry],
    base_data: WordEntry,
    level_node: WikiNode,
) -> None:
    """
    Parse one section of an entry and then, recursively, its subsections.

    Individual sibling sections of an entry look like this,
    e.g. https://es.wiktionary.org/wiki/amor:

    === Etimología ===
    === {{sustantivo masculino|es}} ===
    === Locuciones ===

    The cleaned, lower-cased heading text selects which extractor runs.
    Results are written into ``page_data`` (and, for etymology, into
    ``base_data``); nothing is returned.

    Args:
        wxr: Extraction context; provides ``wtp`` for bookkeeping/debug
            messages and ``config`` for the capture flags.
        page_data: Entries collected so far for the page. Mutated in
            place: part-of-speech sections append a new entry, most other
            sections attach data to the existing entries.
        base_data: Template entry that new entries are deep-copied from.
        level_node: The heading node of the section to process.
    """

    categories = {}
    original_section_title = clean_node(wxr, categories, level_node.largs)
    section_title = original_section_title.lower()
    wxr.wtp.start_subsection(original_section_title)
    if section_title == "":
        return

    # Name of the first template in the heading, if any; headings such as
    # "=== {{sustantivo masculino|es}} ===" identify the POS that way.
    pos_template_name = ""
    for level_node_template in level_node.find_content(NodeKind.TEMPLATE):
        pos_template_name = level_node_template.template_name
        break

    # Candidate POS keys, tried in this order: the full title, the heading
    # template name, the first two words, and the first word alone.
    title_words = section_title.split()
    pos_keys = [
        section_title,
        pos_template_name,
        " ".join(title_words[:2]),
        title_words[0] if title_words else "",
    ]
    # Single lookup: the first candidate that is a known POS title wins.
    pos_data = next(
        (POS_TITLES[key] for key in pos_keys if key in POS_TITLES), None
    )

    if section_title in IGNORED_TITLES:
        pass
    elif pos_data is not None:
        entry = base_data.model_copy(deep=True)
        entry.pos = pos_data["pos"]
        entry.pos_title = original_section_title
        entry.tags.extend(pos_data.get("tags", []))
        entry.categories.extend(categories.get("categories", []))
        page_data.append(entry)
        extract_pos_section(wxr, entry, level_node, section_title)
        if not entry.senses:
            if "form-of" in entry.tags:
                # A form-of entry without any sense is dropped.
                page_data.pop()
            elif section_title in LINKAGE_TITLES:
                # The heading matched a POS key but produced no senses and
                # is also a linkage title: treat it as a linkage section.
                page_data.pop()
                if not page_data:
                    # Same guard as the other branches below: make sure
                    # there is an entry for the extracted data to land on.
                    page_data.append(base_data.model_copy(deep=True))
                extract_linkage_section(
                    wxr,
                    page_data,
                    level_node,
                    LINKAGE_TITLES[section_title],
                )
    elif (
        section_title.startswith("etimología")
        and wxr.config.capture_etymologies
    ):
        if level_node.contain_node(LEVEL_KIND_FLAGS):
            # This etymology section has nested sections: rebind the local
            # ``base_data`` to a private copy, which is what the recursive
            # calls at the end of this function receive.
            base_data = base_data.model_copy(deep=True)
        extract_etymology_section(wxr, base_data, level_node)
    elif (
        section_title in TRANSLATIONS_TITLES and wxr.config.capture_translations
    ):
        if not page_data:
            page_data.append(base_data.model_copy(deep=True))
        extract_translation_section(wxr, page_data, level_node)
    elif section_title == "descendientes":
        if not page_data:
            page_data.append(base_data.model_copy(deep=True))
        # Descendants reuse the translation extractor, with its fourth
        # positional argument set to False.
        extract_translation_section(wxr, page_data, level_node, False)
    elif (
        section_title in LINKAGE_TITLES
        or section_title.removesuffix("s") in LINKAGE_TITLES
    ):
        # Accept a title that only matches once a trailing "s" is removed.
        if section_title not in LINKAGE_TITLES:
            section_title = section_title.removesuffix("s")
        if not page_data:
            page_data.append(base_data.model_copy(deep=True))
        extract_linkage_section(
            wxr, page_data, level_node, LINKAGE_TITLES[section_title]
        )
    elif section_title == "conjugación":
        if not page_data:
            page_data.append(base_data.model_copy(deep=True))
        extract_conjugation_section(wxr, page_data, level_node)
    elif section_title == "formas alternativas":
        extract_alt_form_section(
            wxr, page_data[-1] if page_data else base_data, level_node
        )
    elif section_title == "información adicional":
        extract_additional_information_section(
            wxr, page_data[-1] if page_data else base_data, level_node
        )
    else:
        wxr.wtp.debug(
            f"Unprocessed section: {section_title}",
            sortid="extractor/es/page/parse_section/48",
        )

    # Run clean_node over links that are direct children of the section,
    # targeting the newest entry (or base_data when there is none yet).
    # NOTE(review): presumably this records category links - confirm
    # against clean_node.
    for link_node in level_node.find_child(NodeKind.LINK):
        clean_node(wxr, page_data[-1] if page_data else base_data, link_node)

    for next_level_node in level_node.find_child(LEVEL_KIND_FLAGS):
        parse_section(wxr, page_data, base_data, next_level_node)

144 

145 

def parse_page(
    wxr: WiktextractContext, page_title: str, page_text: str
) -> list[dict[str, Any]]:
    """
    Parse one Spanish Wiktionary page into a list of word-entry dicts.

    Every level-2 section except "Referencias y notas" is treated as a
    language section: its ``{{lengua}}`` heading template supplies the
    language code and name, a base entry is built from them, and each
    child section is handed to ``parse_section``.

    Args:
        wxr: Extraction context (parser in ``wtp``, options in ``config``).
        page_title: Title of the page; used as the ``word`` of each entry.
        page_text: Wikitext of the page.

    Returns:
        One dict per collected entry, produced with
        ``model_dump(exclude_defaults=True)``. Entries that ended up with
        no senses get a single sense tagged ``no-gloss``.
    """
    # style guide
    # https://es.wiktionary.org/wiki/Wikcionario:Guía_de_estilo
    # entry layout
    # https://es.wiktionary.org/wiki/Wikcionario:Estructura
    if wxr.config.verbose:
        logger.info(f"Parsing page: {page_title}")
    wxr.wtp.start_page(page_title)
    tree = wxr.wtp.parse(page_text)
    page_data: list[WordEntry] = []
    for level2_node in tree.find_child(NodeKind.LEVEL2):
        categories = {}
        lang_code = "unknown"
        lang_name = "unknown"
        section_title = clean_node(wxr, None, level2_node.largs)
        if section_title.lower() == "referencias y notas":
            continue
        for subtitle_template in level2_node.find_content(NodeKind.TEMPLATE):
            # https://es.wiktionary.org/wiki/Plantilla:lengua
            # https://es.wiktionary.org/wiki/Apéndice:Códigos_de_idioma
            if subtitle_template.template_name == "lengua":
                # The first positional argument can be absent; keep the
                # "unknown" default rather than failing on None.
                # NOTE(review): a non-string value is also skipped here -
                # confirm which value types wikitextprocessor can store.
                raw_lang_code = subtitle_template.template_parameters.get(1)
                if isinstance(raw_lang_code, str):
                    lang_code = raw_lang_code.lower()
                lang_name = clean_node(wxr, categories, subtitle_template)
                break
        if (
            wxr.config.capture_language_codes is not None
            and lang_code not in wxr.config.capture_language_codes
        ):
            continue
        wxr.wtp.start_section(lang_name)
        base_data = WordEntry(
            lang=lang_name,
            lang_code=lang_code,
            word=page_title,
            pos="unknown",
            categories=categories.get("categories", []),
        )
        # Direct children of the language section: the pronunciation
        # template and links are applied to the base entry, so entries
        # copied from it later carry that data.
        for node in level2_node.find_child(NodeKind.TEMPLATE | NodeKind.LINK):
            if (
                isinstance(node, TemplateNode)
                and node.template_name == "pron-graf"
            ):
                process_pron_graf_template(wxr, base_data, node)
            elif node.kind == NodeKind.LINK:
                clean_node(wxr, base_data, node)

        for next_level_node in level2_node.find_child(LEVEL_KIND_FLAGS):
            parse_section(wxr, page_data, base_data, next_level_node)

    for data in page_data:
        if not data.senses:
            data.senses.append(Sense(tags=["no-gloss"]))
    return [d.model_dump(exclude_defaults=True) for d in page_data]