# src/wiktextract/extractor/es/page.py
# (recovered from a coverage.py v7.9.2 report: 109 statements, 79% covered,
# generated 2025-07-04 10:58 +0000)

from typing import Any

from wikitextprocessor.parser import (
    LEVEL_KIND_FLAGS,
    NodeKind,
    TemplateNode,
    WikiNode,
)

from ...page import clean_node
from ...wxr_context import WiktextractContext
from ...wxr_logging import logger
from .conjugation import extract_conjugation_section
from .etymology import extract_etymology_section
from .linkage import (
    extract_additional_information_section,
    extract_alt_form_section,
    extract_linkage_section,
)
from .models import Sense, WordEntry
from .pos import extract_pos_section
from .pronunciation import process_pron_graf_template
from .section_titles import (
    IGNORED_TITLES,
    LINKAGE_TITLES,
    POS_TITLES,
    TRANSLATIONS_TITLES,
)
from .translation import extract_translation_section

28 

29 

def parse_section(
    wxr: WiktextractContext,
    page_data: list[WordEntry],
    base_data: WordEntry,
    level_node: WikiNode,
) -> None:
    """
    Parses individual sibling sections of an entry,
    e.g. https://es.wiktionary.org/wiki/amor:

    === Etimología ===
    === {{sustantivo masculino|es}} ===
    === Locuciones ===

    Dispatches on the (lower-cased) section title — and, for POS sections,
    on the heading's first template name — then recurses into any nested
    sub-sections. Appends new ``WordEntry`` objects to ``page_data`` as
    POS sections are found; ``base_data`` holds per-language defaults.
    """
    categories = {}
    section_title = clean_node(wxr, categories, level_node.largs)
    original_section_title = section_title
    section_title = section_title.lower()
    wxr.wtp.start_subsection(original_section_title)

    # POS headings are often given as a template, e.g. {{sustantivo masculino|es}};
    # take the first template name in the heading, if any.
    pos_template_name = ""
    for level_node_template in level_node.find_content(NodeKind.TEMPLATE):
        pos_template_name = level_node_template.template_name
        break

    # Candidate keys for looking up the POS, from most to least specific:
    # full title, heading template name, first two words, first word.
    # Guard against an empty heading: section_title.split()[0] would raise
    # IndexError when clean_node() yields an empty string.
    title_words = section_title.split()
    pos_keys = [
        section_title,
        pos_template_name,
        " ".join(title_words[:2]),
        title_words[0] if title_words else "",
    ]
    if section_title in IGNORED_TITLES:
        pass
    elif any(key in POS_TITLES for key in pos_keys):
        # Find the first candidate key that resolves to POS data.
        pos_data = None
        for key in pos_keys:
            pos_data = POS_TITLES.get(key)
            if pos_data is not None:
                break
        if pos_data is not None:
            pos_type = pos_data["pos"]
            # Each POS section becomes its own WordEntry, seeded from base_data.
            page_data.append(base_data.model_copy(deep=True))
            page_data[-1].pos = pos_type
            page_data[-1].pos_title = original_section_title
            page_data[-1].tags.extend(pos_data.get("tags", []))
            page_data[-1].categories.extend(categories.get("categories", []))
            extract_pos_section(wxr, page_data[-1], level_node, section_title)
            if len(page_data[-1].senses) == 0:
                if "form-of" in page_data[-1].tags:
                    # Empty form-of entry: drop it entirely.
                    page_data.pop()
                elif section_title in LINKAGE_TITLES:
                    # Heading matched a POS key but is really a linkage
                    # section (e.g. "locuciones"); reprocess it as such.
                    page_data.pop()
                    extract_linkage_section(
                        wxr,
                        page_data,
                        level_node,
                        LINKAGE_TITLES[section_title],
                    )
    elif (
        section_title.startswith("etimología")
        and wxr.config.capture_etymologies
    ):
        if level_node.contain_node(LEVEL_KIND_FLAGS):
            # Etymology section has nested sub-sections: copy base_data so
            # the etymology only applies to entries under this section.
            base_data = base_data.model_copy(deep=True)
        extract_etymology_section(wxr, base_data, level_node)
    elif (
        section_title in TRANSLATIONS_TITLES and wxr.config.capture_translations
    ):
        if len(page_data) == 0:
            page_data.append(base_data.model_copy(deep=True))
        extract_translation_section(wxr, page_data, level_node)
    elif section_title == "descendientes":
        if len(page_data) == 0:
            page_data.append(base_data.model_copy(deep=True))
        # Descendants reuse the translation extractor with is_translation=False.
        extract_translation_section(wxr, page_data, level_node, False)
    elif (
        section_title in LINKAGE_TITLES
        or section_title.removesuffix("s") in LINKAGE_TITLES
    ):
        # Accept plural headings ("sinónimos") by stripping a trailing "s".
        if section_title not in LINKAGE_TITLES:
            section_title = section_title.removesuffix("s")
        if len(page_data) == 0:
            page_data.append(base_data.model_copy(deep=True))
        extract_linkage_section(
            wxr, page_data, level_node, LINKAGE_TITLES[section_title]
        )
    elif section_title == "conjugación":
        if len(page_data) == 0:
            page_data.append(base_data.model_copy(deep=True))
        extract_conjugation_section(wxr, page_data, level_node)
    elif section_title == "formas alternativas":
        extract_alt_form_section(
            wxr, page_data[-1] if len(page_data) > 0 else base_data, level_node
        )
    elif section_title == "información adicional":
        extract_additional_information_section(
            wxr, page_data[-1] if len(page_data) > 0 else base_data, level_node
        )
    else:
        wxr.wtp.debug(
            f"Unprocessed section: {section_title}",
            sortid="extractor/es/page/parse_section/48",
        )

    # Bare category links directly under the heading attach to the newest
    # entry (or to base_data when no entry has been created yet).
    for link_node in level_node.find_child(NodeKind.LINK):
        clean_node(
            wxr, page_data[-1] if len(page_data) > 0 else base_data, link_node
        )

    # Recurse into nested sub-sections.
    for next_level_node in level_node.find_child(LEVEL_KIND_FLAGS):
        parse_section(wxr, page_data, base_data, next_level_node)

142 

143 

def parse_page(
    wxr: WiktextractContext, page_title: str, page_text: str
) -> list[dict[str, Any]]:
    """
    Parse one Spanish-Wiktionary page into a list of word-entry dicts.

    Each level-2 section is a language section headed by the {{lengua}}
    template; its sub-sections are handled by ``parse_section``.

    # style guide
    # https://es.wiktionary.org/wiki/Wikcionario:Guía_de_estilo
    # entry layout
    # https://es.wiktionary.org/wiki/Wikcionario:Estructura
    """
    if wxr.config.verbose:
        logger.info(f"Parsing page: {page_title}")
    wxr.wtp.start_page(page_title)
    tree = wxr.wtp.parse(page_text)
    page_data: list[WordEntry] = []
    for level2_node in tree.find_child(NodeKind.LEVEL2):
        categories = {}
        lang_code = "unknown"
        lang_name = "unknown"
        section_title = clean_node(wxr, None, level2_node.largs)
        if section_title.lower() == "referencias y notas":
            continue
        for subtitle_template in level2_node.find_content(NodeKind.TEMPLATE):
            # https://es.wiktionary.org/wiki/Plantilla:lengua
            # https://es.wiktionary.org/wiki/Apéndice:Códigos_de_idioma
            if subtitle_template.template_name == "lengua":
                # The language code is the first positional parameter; it may
                # be absent or a node rather than a plain string, so clean it
                # instead of calling .lower() on a possible None.
                lang_code_arg = subtitle_template.template_parameters.get(1, "")
                cleaned_code = clean_node(wxr, None, lang_code_arg).lower()
                if cleaned_code != "":
                    lang_code = cleaned_code
                lang_name = clean_node(wxr, categories, subtitle_template)
                break
        if (
            wxr.config.capture_language_codes is not None
            and lang_code not in wxr.config.capture_language_codes
        ):
            continue
        wxr.wtp.start_section(lang_name)
        base_data = WordEntry(
            lang=lang_name,
            lang_code=lang_code,
            word=page_title,
            pos="unknown",
            categories=categories.get("categories", []),
        )
        # Pronunciation template and category links sit directly under the
        # level-2 heading, before any level-3 section.
        for node in level2_node.find_child(NodeKind.TEMPLATE | NodeKind.LINK):
            if (
                isinstance(node, TemplateNode)
                and node.template_name == "pron-graf"
            ):
                process_pron_graf_template(wxr, base_data, node)
            elif node.kind == NodeKind.LINK:
                clean_node(wxr, base_data, node)

        for next_level_node in level2_node.find_child(LEVEL_KIND_FLAGS):
            parse_section(wxr, page_data, base_data, next_level_node)

    # Entries that ended up without senses still get a placeholder sense
    # so downstream consumers always see at least one.
    for data in page_data:
        if len(data.senses) == 0:
            data.senses.append(Sense(tags=["no-gloss"]))
    return [d.model_dump(exclude_defaults=True) for d in page_data]