Coverage for src/wiktextract/extractor/pt/page.py: 77%

97 statements  

« prev     ^ index     » next       coverage.py v7.9.0, created at 2025-06-13 07:43 +0000

1from typing import Any 

2 

3from wikitextprocessor.parser import ( 

4 LEVEL_KIND_FLAGS, 

5 LevelNode, 

6 NodeKind, 

7) 

8 

9from ...page import clean_node 

10from ...wxr_context import WiktextractContext 

11from .etymology import extract_etymology_section 

12from .inflection import extract_conjugation_section, extract_degree_section 

13from .linkage import ( 

14 extract_expression_section, 

15 extract_forms_section, 

16 extract_linkage_section, 

17 extract_phraseology_section, 

18) 

19from .models import Sense, WordEntry 

20from .pos import extract_pos_section 

21from .pronunciation import extract_pronunciation_section 

22from .section_titles import ( 

23 FORM_SECTION_TAGS, 

24 LINKAGE_SECTIONS, 

25 LINKAGE_TAGS, 

26 POS_DATA, 

27) 

28from .translation import extract_translation_section 

29 

30 

def parse_section(
    wxr: WiktextractContext,
    page_data: list[WordEntry],
    base_data: WordEntry,
    level_node: LevelNode,
) -> None:
    """Dispatch one section of a pt.wiktionary page to its extractor.

    The cleaned section title selects the matching ``extract_*`` helper.
    Categories collected while cleaning the title, and from bare links
    directly under the section, are attached to the entries afterwards.
    Child sections are parsed recursively, except under "Pronúncia" and
    "Ver também".

    Args:
        wxr: shared extraction context.
        page_data: entries created so far for this page; extractors may
            append to it or read its last element.
        base_data: language-level template entry used when no POS entry
            exists yet.
        level_node: the heading node for this section.
    """
    cats = {}
    # Titles may carry index digits (ASCII or superscript) and trailing
    # colons/whitespace; strip them to get the bare section name.
    title_text = clean_node(wxr, cats, level_node.largs).strip(
        "⁰¹²³⁴⁵⁶⁷⁸⁹0123456789: \n"
    )
    if title_text.lower() in POS_DATA:
        extract_pos_section(
            wxr,
            page_data,
            base_data,
            level_node,
            title_text,
            cats.get("categories", []),
        )
        # Some titles are both a POS name and a form/expression heading.
        # If the POS pass produced no senses, drop that entry and
        # re-parse the section as a forms or expression list instead.
        if len(page_data[-1].senses) == 0 and title_text in FORM_SECTION_TAGS:
            page_data.pop()
            extract_forms_section(
                wxr,
                page_data[-1] if len(page_data) > 0 else base_data,
                level_node,
                title_text,
            )
        elif len(page_data[-1].senses) == 0 and title_text == "Expressão":
            page_data.pop()
            extract_expression_section(
                wxr,
                page_data[-1] if len(page_data) > 0 else base_data,
                level_node,
            )
    elif title_text in ["Tradução", "Traduções", "Cognatos", "Descendentes"]:
        extract_translation_section(
            wxr,
            page_data[-1] if len(page_data) > 0 else base_data,
            level_node,
            title_text,
        )
    elif title_text == "Expressões":
        extract_expression_section(
            wxr, page_data[-1] if len(page_data) > 0 else base_data, level_node
        )
    elif title_text.lower() in LINKAGE_SECTIONS:
        extract_linkage_section(
            wxr,
            page_data[-1] if len(page_data) > 0 else base_data,
            level_node,
            LINKAGE_SECTIONS[title_text.lower()],
            "",
            0,
            "",
            LINKAGE_TAGS.get(title_text.lower(), []),
        )
    elif title_text == "Etimologia":
        extract_etymology_section(wxr, page_data, level_node)
    elif title_text in ["Pronúncia", "Romanização"]:
        extract_pronunciation_section(wxr, page_data, level_node)
    elif title_text == "Fraseologia":
        extract_phraseology_section(
            wxr, page_data[-1] if len(page_data) else base_data, level_node
        )
    elif title_text.startswith(("Nota", "Uso")):
        extract_note_section(wxr, page_data, level_node)
    elif title_text == "Conjugação":
        extract_conjugation_section(
            wxr, page_data[-1] if len(page_data) else base_data, level_node
        )
    elif title_text == "Graus":
        extract_degree_section(
            wxr, page_data[-1] if len(page_data) else base_data, level_node
        )
    elif title_text in FORM_SECTION_TAGS:
        extract_forms_section(
            wxr,
            page_data[-1] if len(page_data) > 0 else base_data,
            level_node,
            title_text,
        )
    # Known-but-ignored section titles; anything else is logged so new
    # section types can be noticed and supported.
    elif title_text.lower() not in [
        "ver também",
        "ligação externa",
        "ligações externas",
        "ligação extena",
        "referências",
        "referência",
        "no wikcionário",
        "na wikipédia",
        "no wikiquote",
        "no wikispecies",
        "no wikisaurus",
        "no commons",
        "no wikimedia commons",
        "na internet",
        "galeria",
        "galeria de imagens",
        "brasil",
        "portugal",
    ]:
        wxr.wtp.debug(f"unknown section: {title_text}")

    # Title categories for POS sections were already passed to
    # extract_pos_section above; save the rest here.  With
    # from_title=True, save_section_cats only applies them for level-2
    # (language) headings.
    if title_text.lower() not in POS_DATA:
        save_section_cats(
            cats.get("categories", []), page_data, level_node, True
        )
    cats = {}
    # Bare links placed directly under the heading may add categories.
    for link_node in level_node.find_child(NodeKind.LINK):
        clean_node(wxr, cats, link_node)
    save_section_cats(cats.get("categories", []), page_data, level_node, False)

    # Recurse into subsections.  NOTE(review): "Pronúncia" and
    # "Ver também" are skipped here — presumably their extractors deal
    # with (or deliberately ignore) their own child headings; confirm.
    if title_text.lower() not in ["pronúncia", "ver também"]:
        for next_level in level_node.find_child(LEVEL_KIND_FLAGS):
            parse_section(wxr, page_data, base_data, next_level)

146 

147 

def save_section_cats(
    cats: list[str],
    page_data: list[WordEntry],
    level_node: LevelNode,
    from_title: bool,
) -> None:
    """Attach category names to the entries they belong to.

    Categories found in bare links (``from_title`` is ``False``) or in a
    language-level (level-2) heading title apply to every entry sharing
    the current language code; categories from a deeper heading title
    only go to the most recent entry.
    """
    applies_to_language = not from_title or level_node.kind == NodeKind.LEVEL2
    if applies_to_language:
        for entry in page_data:
            if entry.lang_code == page_data[-1].lang_code:
                entry.categories.extend(cats)
    elif len(page_data) > 0:
        page_data[-1].categories.extend(cats)

160 

161 

def parse_page(
    wxr: WiktextractContext, page_title: str, page_text: str
) -> list[dict[str, Any]]:
    """Parse one pt.wiktionary page into a list of word-entry dicts.

    Page layout:
    https://pt.wiktionary.org/wiki/Wikcionário:Livro_de_estilo
    """
    # Translation subpages and thesaurus pages are not word entries.
    is_excluded_page = (
        "/traduções" in page_title
        or "/tradução" in page_title
        or page_title.startswith("Wikisaurus:")
    )
    if is_excluded_page:
        return []
    wxr.wtp.start_page(page_title)
    root = wxr.wtp.parse(page_text)
    page_data: list[WordEntry] = []
    for lang_level_node in root.find_child(NodeKind.LEVEL1):
        lang_cats = {}
        lang_name = clean_node(wxr, lang_cats, lang_level_node.largs)
        if not lang_name:
            lang_name = "unknown"
        # The language code comes from the first template in the heading
        # (its name with surrounding "-" stripped, e.g. "-pt-" -> "pt").
        lang_code = "unknown"
        heading_template = next(
            iter(lang_level_node.find_content(NodeKind.TEMPLATE)), None
        )
        if heading_template is not None:
            # template "--" strips down to an empty name
            lang_code = heading_template.template_name.strip("-") or "unknown"
        wanted_codes = wxr.config.capture_language_codes
        if wanted_codes is not None and lang_code not in wanted_codes:
            continue
        wxr.wtp.start_section(lang_name)
        base_data = WordEntry(
            word=wxr.wtp.title,
            lang_code=lang_code,
            lang=lang_name,
            pos="unknown",
            categories=lang_cats.get("categories", []),
        )
        for section_node in lang_level_node.find_child(LEVEL_KIND_FLAGS):
            parse_section(wxr, page_data, base_data, section_node)

    # Entries that never gained a gloss get an explicit "no-gloss" sense.
    for entry in page_data:
        if not entry.senses:
            entry.senses.append(Sense(tags=["no-gloss"]))
    return [entry.model_dump(exclude_defaults=True) for entry in page_data]

208 

209 

def extract_note_section(
    wxr: WiktextractContext,
    page_data: list[WordEntry],
    level_node: LevelNode,
) -> None:
    """Collect usage notes from the section's list items.

    Each list item (nested sublists excluded from its own text) becomes
    one note; the notes are appended to every entry that shares the
    current language code.
    """
    notes = [
        text
        for item in level_node.find_child_recursively(NodeKind.LIST_ITEM)
        if (
            text := clean_node(
                wxr, None, list(item.invert_find_child(NodeKind.LIST))
            )
        )
        != ""
    ]
    for entry in page_data:
        if entry.lang_code == page_data[-1].lang_code:
            entry.notes.extend(notes)