Coverage for src/wiktextract/extractor/pt/page.py: 77%

97 statements  

coverage.py v7.10.3, created at 2025-08-15 05:18 +0000

  1  from typing import Any
  2
  3  from wikitextprocessor.parser import (
  4      LEVEL_KIND_FLAGS,
  5      LevelNode,
  6      NodeKind,
  7  )
  8
  9  from ...page import clean_node
 10  from ...wxr_context import WiktextractContext
 11  from .etymology import extract_etymology_section
 12  from .inflection import extract_conjugation_section, extract_degree_section
 13  from .linkage import (
 14      extract_expression_section,
 15      extract_forms_section,
 16      extract_linkage_section,
 17      extract_phraseology_section,
 18  )
 19  from .models import Sense, WordEntry
 20  from .pos import extract_pos_section
 21  from .pronunciation import extract_pronunciation_section
 22  from .section_titles import (
 23      FORM_SECTION_TAGS,
 24      LINKAGE_SECTIONS,
 25      LINKAGE_TAGS,
 26      POS_DATA,
 27  )
 28  from .translation import extract_translation_section
 29
 30

 31  def parse_section(
 32      wxr: WiktextractContext,
 33      page_data: list[WordEntry],
 34      base_data: WordEntry,
 35      level_node: LevelNode,
 36  ) -> None:
 37      cats = {}
 38      title_text = clean_node(wxr, cats, level_node.largs).strip(
 39          "⁰¹²³⁴⁵⁶⁷⁸⁹0123456789: \n"
 40      )
 41      if title_text.lower() in POS_DATA:
 42          extract_pos_section(
 43              wxr,
 44              page_data,
 45              base_data,
 46              level_node,
 47              title_text,
 48              cats.get("categories", []),
 49          )
 50          if len(page_data[-1].senses) == 0 and title_text in FORM_SECTION_TAGS:
 51              page_data.pop()
 52              extract_forms_section(
 53                  wxr,
 54                  page_data[-1] if len(page_data) > 0 else base_data,
 55                  level_node,
 56                  title_text,
 57              )
 58          elif len(page_data[-1].senses) == 0 and title_text == "Expressão":  # 58 ↛ 59: condition never true
 59              page_data.pop()
 60              extract_expression_section(
 61                  wxr,
 62                  page_data[-1] if len(page_data) > 0 else base_data,
 63                  level_node,
 64              )
 65      elif title_text in ["Tradução", "Traduções", "Cognatos", "Descendentes"]:
 66          extract_translation_section(
 67              wxr,
 68              page_data[-1] if len(page_data) > 0 else base_data,
 69              level_node,
 70              title_text,
 71          )
 72      elif title_text == "Expressões":
 73          extract_expression_section(
 74              wxr, page_data[-1] if len(page_data) > 0 else base_data, level_node
 75          )
 76      elif title_text.lower() in LINKAGE_SECTIONS:
 77          extract_linkage_section(
 78              wxr,
 79              page_data[-1] if len(page_data) > 0 else base_data,
 80              level_node,
 81              LINKAGE_SECTIONS[title_text.lower()],
 82              "",
 83              0,
 84              "",
 85              LINKAGE_TAGS.get(title_text.lower(), []),
 86          )
 87      elif title_text == "Etimologia":
 88          extract_etymology_section(wxr, page_data, level_node)
 89      elif title_text in ["Pronúncia", "Romanização"]:
 90          extract_pronunciation_section(
 91              wxr, page_data if len(page_data) else [base_data], level_node
 92          )
 93      elif title_text == "Fraseologia":
 94          extract_phraseology_section(
 95              wxr, page_data[-1] if len(page_data) else base_data, level_node
 96          )
 97      elif title_text.startswith(("Nota", "Uso")):  # 97 ↛ 98: condition never true
 98          extract_note_section(wxr, page_data, level_node)
 99      elif title_text == "Conjugação":
100          extract_conjugation_section(
101              wxr, page_data[-1] if len(page_data) else base_data, level_node
102          )
103      elif title_text == "Graus":
104          extract_degree_section(
105              wxr, page_data[-1] if len(page_data) else base_data, level_node
106          )
107      elif title_text in FORM_SECTION_TAGS:
108          extract_forms_section(
109              wxr,
110              page_data[-1] if len(page_data) > 0 else base_data,
111              level_node,
112              title_text,
113          )
114      elif title_text.lower() not in [  # 114 ↛ 134: condition never true
115          "ver também",
116          "ligação externa",
117          "ligações externas",
118          "ligação extena",
119          "referências",
120          "referência",
121          "no wikcionário",
122          "na wikipédia",
123          "no wikiquote",
124          "no wikispecies",
125          "no wikisaurus",
126          "no commons",
127          "no wikimedia commons",
128          "na internet",
129          "galeria",
130          "galeria de imagens",
131          "brasil",
132          "portugal",
133      ]:
134          wxr.wtp.debug(f"unknown section: {title_text}")
135
136      if title_text.lower() not in POS_DATA:
137          save_section_cats(
138              cats.get("categories", []), page_data, level_node, True
139          )
140      cats = {}
141      for link_node in level_node.find_child(NodeKind.LINK):  # 141 ↛ 142: loop never started
142          clean_node(wxr, cats, link_node)
143      save_section_cats(cats.get("categories", []), page_data, level_node, False)
144
145      if title_text.lower() not in ["pronúncia", "ver também"]:
146          for next_level in level_node.find_child(LEVEL_KIND_FLAGS):
147              parse_section(wxr, page_data, base_data, next_level)
148
149
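The dispatch in parse_section depends on how the heading text is normalized on lines 38-40 before it is matched against POS_DATA, FORM_SECTION_TAGS and the other tables. A minimal standalone sketch of that strip() call; the heading strings below are invented examples, not taken from the report:

    # Same strip() characters as lines 38-40: superscript digits, ASCII digits,
    # colons, spaces and newlines are trimmed from both ends of the heading.
    STRIP_CHARS = "⁰¹²³⁴⁵⁶⁷⁸⁹0123456789: \n"

    for heading in ("Substantivo¹", "Etimologia 2", "Tradução:"):  # hypothetical headings
        print(heading.strip(STRIP_CHARS))
    # -> Substantivo / Etimologia / Tradução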

150  def save_section_cats(
151      cats: list[str],
152      page_data: list[WordEntry],
153      level_node: LevelNode,
154      from_title: bool,
155  ) -> None:
156      if not from_title or (from_title and level_node.kind == NodeKind.LEVEL2):
157          for data in page_data:
158              if data.lang_code == page_data[-1].lang_code:  # 158 ↛ 157: condition always true
159                  data.categories.extend(cats)
160      elif len(page_data) > 0:  # 160 ↛ exit: condition always true
161          page_data[-1].categories.extend(cats)
162
163
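For context, the fan-out that save_section_cats performs when not called from a non-level-2 title: every entry sharing the language of the most recent entry receives the categories. The sketch below uses a plain dataclass as a stand-in for the real pydantic WordEntry model, purely to keep it self-contained:

    from dataclasses import dataclass, field

    @dataclass
    class FakeEntry:  # stand-in for models.WordEntry (illustration only)
        lang_code: str
        categories: list[str] = field(default_factory=list)

    page_data = [FakeEntry("pt"), FakeEntry("pt"), FakeEntry("en")]
    cats = ["Substantivo (Português)"]  # hypothetical category name
    for data in page_data:
        if data.lang_code == page_data[-1].lang_code:
            data.categories.extend(cats)
    # Only the trailing "en" entry matches page_data[-1].lang_code here,
    # so only it receives the categories.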

164  def parse_page(
165      wxr: WiktextractContext, page_title: str, page_text: str
166  ) -> list[dict[str, Any]]:
167      # page layout
168      # https://pt.wiktionary.org/wiki/Wikcionário:Livro_de_estilo
169      if (  # 169 ↛ 175: condition never true
170          "/traduções" in page_title
171          or "/tradução" in page_title
172          or page_title.startswith("Wikisaurus:")
173      ):
174          # skip translation and thesaurus pages
175          return []
176      wxr.wtp.start_page(page_title)
177      tree = wxr.wtp.parse(page_text)
178      page_data: list[WordEntry] = []
179      for level1_node in tree.find_child(NodeKind.LEVEL1):
180          lang_cats = {}
181          lang_name = clean_node(wxr, lang_cats, level1_node.largs)
182          if lang_name == "":  # 182 ↛ 183: condition never true
183              lang_name = "unknown"
184              lang_code = "unknown"
185          for lang_template in level1_node.find_content(NodeKind.TEMPLATE):  # 185 ↛ 190: loop never ran to completion
186              lang_code = lang_template.template_name.strip("-")
187              if lang_code == "":  # template "--"  # 187 ↛ 188: condition never true
188                  lang_code = "unknown"
189              break
190          if (  # 190 ↛ 194: condition never true
191              wxr.config.capture_language_codes is not None
192              and lang_code not in wxr.config.capture_language_codes
193          ):
194              continue
195          wxr.wtp.start_section(lang_name)
196          base_data = WordEntry(
197              word=wxr.wtp.title,
198              lang_code=lang_code,
199              lang=lang_name,
200              pos="unknown",
201              categories=lang_cats.get("categories", []),
202          )
203          for next_level_node in level1_node.find_child(LEVEL_KIND_FLAGS):
204              parse_section(wxr, page_data, base_data, next_level_node)
205
206      for data in page_data:
207          if len(data.senses) == 0:  # 207 ↛ 208: condition never true
208              data.senses.append(Sense(tags=["no-gloss"]))
209      return [m.model_dump(exclude_defaults=True) for m in page_data]
210
211
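The language code on lines 185-189 is taken from the name of the level-1 heading template (the strip("-") call and the comment on line 187 suggest names of the form -xx-, with the bare -- template for an unspecified language). A standalone rerun of that logic on a few assumed template names:

    for template_name in ("-pt-", "-en-", "--"):  # assumed template names
        lang_code = template_name.strip("-")
        if lang_code == "":  # the "--" template carries no code
            lang_code = "unknown"
        print(lang_code)
    # -> pt, en, unknown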

212  def extract_note_section(
213      wxr: WiktextractContext,
214      page_data: list[WordEntry],
215      level_node: LevelNode,
216  ) -> None:
217      notes = []
218      for list_item in level_node.find_child_recursively(NodeKind.LIST_ITEM):
219          note = clean_node(
220              wxr, None, list(list_item.invert_find_child(NodeKind.LIST))
221          )
222          if note != "":
223              notes.append(note)
224      for data in page_data:
225          if data.lang_code == page_data[-1].lang_code:
226              data.notes.extend(notes)
226 data.notes.extend(notes)