Coverage for src/wiktextract/extractor/nl/page.py: 77%

92 statements  

« prev     ^ index     » next       coverage.py v7.6.4, created at 2024-10-25 10:11 +0000

1from typing import Any 

2 

3from mediawiki_langcodes import name_to_code 

4from wikitextprocessor.parser import ( 

5 LEVEL_KIND_FLAGS, 

6 LevelNode, 

7 NodeKind, 

8 WikiNode, 

9) 

10 

11from ...page import clean_node 

12from ...wxr_context import WiktextractContext 

13from .descendant import extract_descendant_section 

14from .etymology import extract_etymology_section 

15from .inflection import extract_inflection_template 

16from .linkage import extract_fixed_preposition_section, extract_linkage_section 

17from .models import Etymology, Sense, WordEntry 

18from .pos import extract_pos_section 

19from .section_titles import LINKAGE_SECTIONS, POS_DATA 

20from .sound import extract_hyphenation_section, extract_sound_section 

21from .spelling_form import extract_spelling_form_section 

22from .translation import extract_translation_section 

23 

24 

def extract_section_categories(
    wxr: WiktextractContext, word_entry: WordEntry, level_node: LevelNode
) -> None:
    """Run ``clean_node`` over each direct LINK child of ``level_node``.

    The cleaned text is discarded; the call is made only for whatever
    ``clean_node`` records on ``word_entry`` as a side effect
    (presumably category links -- confirm against ``clean_node``).
    """
    link_children = level_node.find_child(NodeKind.LINK)
    for link in link_children:
        clean_node(wxr, word_entry, link)

30 

31 

def parse_section(
    wxr: WiktextractContext,
    page_data: list[WordEntry],
    base_data: WordEntry,
    forms_data: WordEntry,
    level_node: WikiNode,
) -> list[Etymology]:
    """Dispatch one heading node to the extractor matching its title.

    The cleaned heading text selects the extractor.  Apart from the
    part-of-speech and etymology extractors, every extractor receives the
    most recent entry of ``page_data``, or ``base_data`` while
    ``page_data`` is still empty.  Afterwards the child headings are
    parsed recursively, the LINK children are passed through
    ``extract_section_categories`` and every TEMPLATE child is handed to
    ``extract_inflection_template`` together with ``forms_data``.

    Returns the result of ``extract_etymology_section`` when this node
    itself is the etymology section, otherwise an empty list.  Values
    returned by the recursive calls are not propagated.
    """
    # title templates
    # https://nl.wiktionary.org/wiki/Categorie:Lemmasjablonen
    heading = clean_node(wxr, None, level_node.largs)
    wxr.wtp.start_subsection(heading)
    etymologies = []

    # Entry that receives the extracted data; nothing between here and the
    # dispatch below modifies ``page_data``.
    target = page_data[-1] if page_data else base_data

    # Sections whose extractor takes exactly (wxr, entry, level_node).
    plain_extractors = {
        "Vertalingen": extract_translation_section,
        "Woordafbreking": extract_hyphenation_section,
        "Schrijfwijzen": extract_spelling_form_section,
        "Verdere woordvormen": extract_spelling_form_section,
        "Opmerkingen": extract_note_section,
        "Overerving en ontlening": extract_descendant_section,
        "Vaste voorzetsels": extract_fixed_preposition_section,
    }
    # Recognised headings that are deliberately not extracted.
    skipped_titles = (
        "Vervoeging",  # conjugation
        "Verbuiging",  # inflection
        "Gangbaarheid",
        "Meer informatie",
        "Verwijzingen",
        "Citaten",
    )

    # POS_DATA, "Uitspraak" and LINKAGE_SECTIONS are tested first and in
    # this order so that their precedence over the literal titles is kept.
    if heading in POS_DATA:
        extract_pos_section(
            wxr, page_data, base_data, forms_data, level_node, heading
        )
    elif heading == "Uitspraak":
        extract_sound_section(wxr, target, level_node)
    elif heading in LINKAGE_SECTIONS:
        extract_linkage_section(
            wxr, target, level_node, LINKAGE_SECTIONS[heading]
        )
    elif heading in plain_extractors:
        plain_extractors[heading](wxr, target, level_node)
    elif heading == "Woordherkomst en -opbouw":
        etymologies = extract_etymology_section(wxr, level_node)
    elif heading not in skipped_titles:
        wxr.wtp.debug(f"unknown title: {heading}", sortid="nl/page/60")

    for child_level in level_node.find_child(LEVEL_KIND_FLAGS):
        parse_section(wxr, page_data, base_data, forms_data, child_level)

    # Looked up again: the calls above may have appended to ``page_data``.
    latest_entry = page_data[-1] if page_data else base_data
    extract_section_categories(wxr, latest_entry, level_node)

    for template in level_node.find_child(NodeKind.TEMPLATE):
        extract_inflection_template(wxr, forms_data, template)
    return etymologies

107 

108 

def parse_page(
    wxr: WiktextractContext, page_title: str, page_text: str
) -> list[dict[str, Any]]:
    """Parse one page and return its word entries as plain dictionaries.

    Pages whose title ends with ``/vervoeging`` yield an empty list.
    Every level-2 heading is treated as a language section; sections whose
    language code is not in ``wxr.config.capture_language_codes`` (when
    that setting is not ``None``) are skipped.  Etymology data returned by
    ``parse_section`` is attached to the entries of the same language, and
    any entry left without senses receives a single ``no-gloss`` sense.
    """
    # page layout
    # https://nl.wiktionary.org/wiki/WikiWoordenboek:Stramien
    # language templates
    # https://nl.wiktionary.org/wiki/Categorie:Hoofdtaalsjablonen
    if page_title.endswith("/vervoeging"):
        return []  # skip conjugation pages

    wxr.wtp.start_page(page_title)
    root = wxr.wtp.parse(page_text, pre_expand=True)
    entries: list[WordEntry] = []

    for lang_node in root.find_child(NodeKind.LEVEL2):
        lang_name = clean_node(wxr, None, lang_node.largs)
        found_code = name_to_code(lang_name, "nl")
        lang_code = "unknown" if found_code == "" else found_code

        wanted_codes = wxr.config.capture_language_codes
        if wanted_codes is not None and lang_code not in wanted_codes:
            continue

        wxr.wtp.start_section(lang_name)
        base_data = WordEntry(
            word=wxr.wtp.title,
            lang_code=lang_code,
            lang=lang_name,
            pos="unknown",
        )
        # Copied before the category extraction below touches base_data.
        forms_data = base_data.model_copy(deep=True)
        extract_section_categories(wxr, base_data, lang_node)

        # The last non-empty etymology result of this language section wins.
        etymology_data = []
        for section_node in lang_node.find_child(LEVEL_KIND_FLAGS):
            section_etymology = parse_section(
                wxr, entries, base_data, forms_data, section_node
            )
            if section_etymology:
                etymology_data = section_etymology

        for entry in entries:
            if entry.lang_code != lang_code:
                continue
            for etymology in etymology_data:
                # An empty index matches every entry of this language.
                applies = (
                    etymology.index == entry.etymology_index
                    or etymology.index == ""
                )
                if applies:
                    entry.etymology_texts.append(etymology.text)
                    entry.categories.extend(etymology.categories)

    for entry in entries:
        if not entry.senses:
            entry.senses.append(Sense(tags=["no-gloss"]))
    return [entry.model_dump(exclude_defaults=True) for entry in entries]

161 

162 

def extract_note_section(
    wxr: WiktextractContext, word_entry: WordEntry, level_node: LevelNode
) -> None:
    """Append the cleaned text of each list item to ``word_entry.notes``.

    List items are collected recursively below ``level_node``; items whose
    cleaned text is empty are skipped.
    """
    list_items = level_node.find_child_recursively(NodeKind.LIST_ITEM)
    for item in list_items:
        note_text = clean_node(wxr, word_entry, item.children)
        if not note_text:
            continue
        word_entry.notes.append(note_text)