Coverage for src/wiktextract/extractor/nl/page.py: 82%

105 statements  

« prev     ^ index     » next       coverage.py v7.6.10, created at 2024-12-27 08:07 +0000

1import re 

2from typing import Any 

3 

4from mediawiki_langcodes import name_to_code 

5from wikitextprocessor.parser import ( 

6 LEVEL_KIND_FLAGS, 

7 LevelNode, 

8 NodeKind, 

9) 

10 

11from ...page import clean_node 

12from ...wxr_context import WiktextractContext 

13from .descendant import extract_descendant_section 

14from .etymology import extract_etymology_section 

15from .inflection import FORMS_TABLE_TEMPLATES, extract_inflection_template 

16from .linkage import extract_fixed_preposition_section, extract_linkage_section 

17from .models import Etymology, Sense, WordEntry 

18from .pos import extract_pos_section 

19from .section_titles import LINKAGE_SECTIONS, POS_DATA 

20from .sound import extract_hyphenation_section, extract_sound_section 

21from .spelling_form import extract_spelling_form_section 

22from .translation import extract_translation_section 

23 

24 

def extract_section_categories(
    wxr: WiktextractContext, word_entry: WordEntry, level_node: LevelNode
) -> None:
    """Run ``clean_node`` over each LINK child of ``level_node``.

    The cleaned text is thrown away; the call is made only for whatever
    ``clean_node`` records on ``word_entry`` while processing the link
    (presumably category links -- confirm against ``clean_node``).
    """
    link_children = level_node.find_child(NodeKind.LINK)
    for link in link_children:
        clean_node(wxr, word_entry, link)

30 

31 

def select_word_entry(
    page_data: list[WordEntry], base_data: WordEntry
) -> WordEntry:
    """Pick the entry that section data should currently be attached to.

    Returns the last item of ``page_data`` when it exists and has the same
    ``lang_code`` as ``base_data``; otherwise returns ``base_data``.
    """
    # This is a function rather than a cached variable because new entries
    # may be appended to `page_data` between calls.
    if len(page_data) == 0:
        return base_data
    latest_entry = page_data[-1]
    if latest_entry.lang_code == base_data.lang_code:
        return latest_entry
    return base_data

42 

43 

def parse_section(
    wxr: WiktextractContext,
    page_data: list[WordEntry],
    base_data: WordEntry,
    forms_data: WordEntry,
    level_node: LevelNode,
) -> list[Etymology]:
    """Dispatch one section heading to its extractor, then recurse.

    The heading text is cleaned, a trailing numbering suffix is removed, and
    the result selects which extractor runs. Child level nodes are then
    parsed recursively, section categories are collected, and forms-table
    templates found directly under this node are processed.

    Args:
        wxr: Extraction context.
        page_data: Entries collected so far for the page; passed on to
            ``extract_pos_section`` and to recursive calls.
        base_data: Language-level entry used when no entry of the current
            language exists in ``page_data`` yet.
        forms_data: Entry that accumulates inflection forms outside of
            "Vervoeging"/"Verbuiging" sections.
        level_node: The section node to parse.

    Returns:
        The result of ``extract_etymology_section`` when this node itself is
        a "Woordherkomst en -opbouw" section, otherwise an empty list.
        Results of nested sections are not propagated.
    """
    # title templates
    # https://nl.wiktionary.org/wiki/Categorie:Lemmasjablonen
    title_text = clean_node(wxr, None, level_node.largs)
    # Strip a trailing number such as " 1", " #2" or " 3:" from the title.
    title_text = re.sub(r"\s+#?\d+:?$", "", title_text)
    wxr.wtp.start_subsection(title_text)
    etymology_data = []
    is_inflection_title = title_text.startswith(("Vervoeging", "Verbuiging"))

    if title_text in POS_DATA:
        last_data_len = len(page_data)
        extract_pos_section(
            wxr, page_data, base_data, forms_data, level_node, title_text
        )
        # Some titles are both a POS name and a linkage section name. If no
        # new entry was added, treat the section as linkage data instead.
        if len(page_data) == last_data_len and title_text in LINKAGE_SECTIONS:
            extract_linkage_section(
                wxr,
                # Use the language-aware selector, like every other branch,
                # so data never lands on an entry of another language.
                select_word_entry(page_data, base_data),
                level_node,
                LINKAGE_SECTIONS[title_text],
            )
    elif title_text == "Uitspraak":
        extract_sound_section(
            wxr, select_word_entry(page_data, base_data), level_node
        )
    elif title_text in LINKAGE_SECTIONS:
        extract_linkage_section(
            wxr,
            select_word_entry(page_data, base_data),
            level_node,
            LINKAGE_SECTIONS[title_text],
        )
    elif title_text == "Vertalingen":
        extract_translation_section(
            wxr, select_word_entry(page_data, base_data), level_node
        )
    elif title_text == "Woordafbreking":
        extract_hyphenation_section(
            wxr, select_word_entry(page_data, base_data), level_node
        )
    elif title_text == "Woordherkomst en -opbouw":
        etymology_data = extract_etymology_section(wxr, level_node)
    elif title_text in ["Schrijfwijzen", "Verdere woordvormen"]:
        extract_spelling_form_section(
            wxr, select_word_entry(page_data, base_data), level_node
        )
    elif title_text == "Opmerkingen":
        extract_note_section(
            wxr, select_word_entry(page_data, base_data), level_node
        )
    elif title_text == "Overerving en ontlening":
        extract_descendant_section(
            wxr, select_word_entry(page_data, base_data), level_node
        )
    elif title_text == "Vaste voorzetsels":
        extract_fixed_preposition_section(
            wxr, select_word_entry(page_data, base_data), level_node
        )
    elif title_text in [
        "Gangbaarheid",
        "Meer informatie",
        "Verwijzingen",
        "Citaten",
    ]:
        pass  # ignore
    elif not is_inflection_title:
        wxr.wtp.debug(f"unknown title: {title_text}", sortid="nl/page/60")

    for next_level in level_node.find_child(LEVEL_KIND_FLAGS):
        parse_section(wxr, page_data, base_data, forms_data, next_level)
    extract_section_categories(
        wxr, select_word_entry(page_data, base_data), level_node
    )

    # Inside an inflection section the forms go to the current entry of this
    # language if there is one; otherwise they accumulate in `forms_data`.
    if (
        is_inflection_title
        and len(page_data) > 0
        and page_data[-1].lang_code == base_data.lang_code
    ):
        inflection_target = page_data[-1]
    else:
        inflection_target = forms_data

    is_first_forms_template = True
    for t_node in level_node.find_child(NodeKind.TEMPLATE):
        if t_node.template_name in FORMS_TABLE_TEMPLATES:
            if is_first_forms_template:
                is_first_forms_template = False
                # The first forms table of this section replaces any forms
                # gathered earlier instead of adding to them.
                if len(forms_data.forms) > 0:
                    forms_data.forms.clear()
                    forms_data.extracted_vervoeging_page = False
            extract_inflection_template(wxr, inflection_target, t_node)
    return etymology_data

140 

141 

def parse_page(
    wxr: WiktextractContext, page_title: str, page_text: str
) -> list[dict[str, Any]]:
    """Parse one page into a list of dumped word entries.

    Pages whose title ends with "/vervoeging" yield an empty list. Otherwise
    each level-2 node is treated as a language section: its subsections are
    parsed, the etymology data returned by them is merged into the entries
    of that language, and entries left without senses get a "no-gloss"
    placeholder sense.

    Args:
        wxr: Extraction context.
        page_title: Title of the page.
        page_text: Wikitext of the page.

    Returns:
        One ``model_dump(exclude_defaults=True)`` dict per collected entry.
    """
    # page layout
    # https://nl.wiktionary.org/wiki/WikiWoordenboek:Stramien
    # language templates
    # https://nl.wiktionary.org/wiki/Categorie:Hoofdtaalsjablonen
    if page_title.endswith("/vervoeging"):
        # conjugation pages are skipped
        return []

    wxr.wtp.start_page(page_title)
    root = wxr.wtp.parse(page_text, pre_expand=True)
    entries: list[WordEntry] = []

    for lang_node in root.find_child(NodeKind.LEVEL2):
        lang_name = clean_node(wxr, None, lang_node.largs)
        lang_code = name_to_code(lang_name, "nl")
        if lang_code == "":
            lang_code = "unknown"
        # Honour the optional language filter from the configuration.
        if not (
            wxr.config.capture_language_codes is None
            or lang_code in wxr.config.capture_language_codes
        ):
            continue

        wxr.wtp.start_section(lang_name)
        base_data = WordEntry(
            word=wxr.wtp.title,
            lang_code=lang_code,
            lang=lang_name,
            pos="unknown",
        )
        # Copy before categories are collected on `base_data`.
        forms_data = base_data.model_copy(deep=True)
        extract_section_categories(wxr, base_data, lang_node)

        for template_node in lang_node.find_child(NodeKind.TEMPLATE):
            extract_inflection_template(wxr, forms_data, template_node)

        # The last non-empty result among the subsections wins.
        etymologies = []
        for section_node in lang_node.find_child(LEVEL_KIND_FLAGS):
            section_etymologies = parse_section(
                wxr, entries, base_data, forms_data, section_node
            )
            if len(section_etymologies) > 0:
                etymologies = section_etymologies

        for entry in entries:
            if entry.lang_code != lang_code:
                continue
            for etymology in etymologies:
                # An etymology with an empty index applies to every entry.
                if (
                    etymology.index == entry.etymology_index
                    or etymology.index == ""
                ):
                    entry.etymology_texts.append(etymology.text)
                    entry.categories.extend(etymology.categories)

    for entry in entries:
        if len(entry.senses) == 0:
            entry.senses.append(Sense(tags=["no-gloss"]))
    return [entry.model_dump(exclude_defaults=True) for entry in entries]

196 

197 

def extract_note_section(
    wxr: WiktextractContext, word_entry: WordEntry, level_node: LevelNode
) -> None:
    """Append the cleaned text of each list item to ``word_entry.notes``.

    List items are searched for recursively below ``level_node``; items
    whose cleaned text is empty are skipped.
    """
    for item_node in level_node.find_child_recursively(NodeKind.LIST_ITEM):
        note_text = clean_node(wxr, word_entry, item_node.children)
        if len(note_text) == 0:
            continue
        word_entry.notes.append(note_text)