Coverage for src/wiktextract/extractor/pt/page.py: 75%

89 statements  

« prev     ^ index     » next       coverage.py v7.6.10, created at 2024-12-27 08:07 +0000

1from typing import Any 

2 

3from wikitextprocessor.parser import ( 

4 LEVEL_KIND_FLAGS, 

5 LevelNode, 

6 NodeKind, 

7) 

8 

9from ...page import clean_node 

10from ...wxr_context import WiktextractContext 

11from .etymology import extract_etymology_section 

12from .inflection import extract_conjugation_section, extract_degree_section 

13from .linkage import ( 

14 extract_expression_section, 

15 extract_linkage_section, 

16 extract_phraseology_section, 

17) 

18from .models import Sense, WordEntry 

19from .pos import extract_pos_section 

20from .pronunciation import extract_pronunciation_section 

21from .section_titles import LINKAGE_SECTIONS, LINKAGE_TAGS, POS_DATA 

22from .translation import extract_translation_section 

23 

24 

# Lower-cased section titles that are skipped without emitting an
# "unknown section" debug message.
_IGNORED_SECTION_TITLES = frozenset(
    {
        "ver também",
        "ligação externa",
        "ligações externas",
        # spelled this way in the original list; presumably mirrors a
        # misspelling found on real pages -- TODO confirm
        "ligação extena",
        "referências",
        "referência",
        "no wikcionário",
        "na wikipédia",
        "no wikiquote",
        "no wikispecies",
        "no wikisaurus",
        "no commons",
        "no wikimedia commons",
        "na internet",
        "galeria",
        "galeria de imagens",
    }
)


def parse_section(
    wxr: WiktextractContext,
    page_data: list[WordEntry],
    base_data: WordEntry,
    level_node: LevelNode,
) -> None:
    """Dispatch one section node to the extractor matching its title.

    The title is cleaned with ``clean_node`` and stripped of digits,
    superscript digits and colons before matching. Sections other than
    part-of-speech, etymology, pronunciation and note sections are applied
    to the last entry of ``page_data``, or to ``base_data`` when
    ``page_data`` is still empty.

    After dispatching, categories gathered from the title (for non-POS
    sections) and from the section's direct link children are stored via
    ``save_section_cats``. Child sections are then parsed recursively,
    except below "Pronúncia" and "Ver também" sections.
    """
    title_cats = {}
    title_text = clean_node(wxr, title_cats, level_node.largs).strip(
        "⁰¹²³⁴⁵⁶⁷⁸⁹0123456789:"
    )
    title_key = title_text.lower()
    is_pos_title = title_key in POS_DATA
    # Entry that receives data from sections attached to "the current word".
    target = page_data[-1] if page_data else base_data

    if is_pos_title:
        extract_pos_section(
            wxr,
            page_data,
            base_data,
            level_node,
            title_text,
            title_cats.get("categories", []),
        )
    elif title_text in ("Tradução", "Traduções", "Cognatos", "Descendentes"):
        extract_translation_section(wxr, target, level_node, title_text)
    elif title_text == "Expressões":
        extract_expression_section(wxr, target, level_node)
    elif title_key in LINKAGE_SECTIONS:
        extract_linkage_section(
            wxr,
            target,
            level_node,
            LINKAGE_SECTIONS[title_key],
            "",
            0,
            "",
            LINKAGE_TAGS.get(title_key, []),
        )
    elif title_text == "Etimologia":
        extract_etymology_section(wxr, page_data, level_node)
    elif title_text == "Pronúncia":
        extract_pronunciation_section(wxr, page_data, level_node)
    elif title_text == "Fraseologia":
        extract_phraseology_section(wxr, target, level_node)
    elif title_text.startswith(("Nota", "Uso")):
        extract_note_section(wxr, page_data, level_node)
    elif title_text == "Conjugação":
        extract_conjugation_section(wxr, target, level_node)
    elif title_text == "Graus":
        extract_degree_section(wxr, target, level_node)
    elif title_key not in _IGNORED_SECTION_TITLES:
        wxr.wtp.debug(f"unknown section: {title_text}")

    # POS sections already received their title categories above.
    if not is_pos_title:
        save_section_cats(
            title_cats.get("categories", []), page_data, level_node, True
        )

    # Categories attached through links placed directly in this section.
    link_cats = {}
    for link_node in level_node.find_child(NodeKind.LINK):
        clean_node(wxr, link_cats, link_node)
    save_section_cats(
        link_cats.get("categories", []), page_data, level_node, False
    )

    if title_key in ("pronúncia", "ver também"):
        return
    for child_level in level_node.find_child(LEVEL_KIND_FLAGS):
        parse_section(wxr, page_data, base_data, child_level)

116 

117 

def save_section_cats(
    cats: list[str],
    page_data: list[WordEntry],
    level_node: LevelNode,
    from_title: bool,
) -> None:
    """Append section categories to the entries already in ``page_data``.

    Args:
        cats: Category names to add.
        page_data: Entries collected so far; modified in place.
        level_node: Section node the categories came from. Only its
            ``kind`` is read, and only when ``from_title`` is true.
        from_title: True when ``cats`` were gathered from the section
            title rather than from the section body.

    Categories found in the section body, or in the title of a level-2
    section, are added to every entry sharing the language code of the last
    entry. Title categories of any other section level are added to the
    last entry only. Nothing happens when ``page_data`` is empty.
    """
    if not page_data:
        return

    latest = page_data[-1]
    if not from_title or level_node.kind == NodeKind.LEVEL2:
        # Read the reference language once instead of on every iteration.
        current_lang = latest.lang_code
        for entry in page_data:
            if entry.lang_code == current_lang:
                entry.categories.extend(cats)
    else:
        latest.categories.extend(cats)

130 

131 

def parse_page(
    wxr: WiktextractContext, page_title: str, page_text: str
) -> list[dict[str, Any]]:
    """Parse one page and return its word entries as dictionaries.

    Translation subpages (title contains "/traduções") and thesaurus pages
    (title starts with "Wikisaurus:") yield an empty list. Otherwise each
    level-1 node is handled as a language section: the language code is the
    name of the first template in the heading with "-" stripped from both
    ends, and languages not listed in ``wxr.config.capture_language_codes``
    (when that is not ``None``) are skipped. Entries left without senses
    get a single ``no-gloss`` sense before being dumped.
    """
    # page layout
    # https://pt.wiktionary.org/wiki/Wikcionário:Livro_de_estilo
    is_translation_subpage = "/traduções" in page_title
    is_thesaurus_page = page_title.startswith("Wikisaurus:")
    if is_translation_subpage or is_thesaurus_page:
        # skip translation and thesaurus pages
        return []

    wxr.wtp.start_page(page_title)
    root = wxr.wtp.parse(page_text)
    entries: list[WordEntry] = []

    for lang_node in root.find_child(NodeKind.LEVEL1):
        lang_cats = {}
        lang_name = clean_node(wxr, lang_cats, lang_node.largs) or "unknown"

        # Only the first template of the heading decides the language code.
        first_template = next(
            iter(lang_node.find_content(NodeKind.TEMPLATE)), None
        )
        lang_code = "unknown"
        if first_template is not None:
            # template "--" strips down to an empty name
            lang_code = first_template.template_name.strip("-") or "unknown"

        wanted_codes = wxr.config.capture_language_codes
        if wanted_codes is not None and lang_code not in wanted_codes:
            continue

        wxr.wtp.start_section(lang_name)
        base_data = WordEntry(
            word=wxr.wtp.title,
            lang_code=lang_code,
            lang=lang_name,
            pos="unknown",
            categories=lang_cats.get("categories", []),
        )
        for section_node in lang_node.find_child(LEVEL_KIND_FLAGS):
            parse_section(wxr, entries, base_data, section_node)

    for entry in entries:
        if not entry.senses:
            entry.senses.append(Sense(tags=["no-gloss"]))
    return [entry.model_dump(exclude_defaults=True) for entry in entries]

174 

175 

def extract_note_section(
    wxr: WiktextractContext,
    page_data: list[WordEntry],
    level_node: LevelNode,
) -> None:
    """Collect list-item notes from a section and attach them to entries.

    Every list item found recursively under ``level_node`` is cleaned with
    ``clean_node``; non-empty results are appended to the ``notes`` of each
    entry in ``page_data`` whose language code equals that of the last
    entry. ``page_data`` is modified in place.
    """
    collected = []
    for item in level_node.find_child_recursively(NodeKind.LIST_ITEM):
        # Presumably keeps the item's own content while leaving out nested
        # lists, whose items this recursive walk also visits -- TODO confirm
        # against the wikitextprocessor API.
        own_nodes = list(item.invert_find_child(NodeKind.LIST))
        text = clean_node(wxr, None, own_nodes)
        if text:
            collected.append(text)

    if not page_data:
        return
    current_lang = page_data[-1].lang_code
    for entry in page_data:
        if entry.lang_code == current_lang:
            entry.notes.extend(collected)