Coverage for src/wiktextract/extractor/nl/pos.py: 90%

130 statements  

« prev     ^ index     » next       coverage.py v7.6.4, created at 2024-10-25 10:11 +0000

1import re 

2 

3from wikitextprocessor import LevelNode, NodeKind, TemplateNode, WikiNode 

4 

5from ...page import clean_node 

6from ...wxr_context import WiktextractContext 

7from .example import ( 

8 EXAMPLE_TEMPLATES, 

9 extract_example_list_item, 

10 extract_example_template, 

11) 

12from .models import AltForm, Sense, WordEntry 

13from .section_titles import POS_DATA 

14from .tags import translate_raw_tags 

15 

16 

def extract_pos_section(
    wxr: WiktextractContext,
    page_data: list[WordEntry],
    base_data: WordEntry,
    forms_data: WordEntry,
    level_node: LevelNode,
    pos_title: str,
) -> None:
    """Open a new word entry for a part-of-speech section and fill it.

    A deep copy of ``base_data`` is appended to ``page_data`` and receives
    the POS code and optional tags that ``POS_DATA`` holds for
    ``pos_title``.  The forms and categories stored in ``forms_data`` are
    copied into the new entry when both have the same POS; otherwise they
    are cleared from ``forms_data``.  Finally the children of
    ``level_node`` are processed by ``extract_pos_section_nodes``.

    Raises ``KeyError`` when ``pos_title`` is not a key of ``POS_DATA``.
    """
    entry = base_data.model_copy(deep=True)
    page_data.append(entry)
    entry.pos_title = pos_title

    pos_info = POS_DATA[pos_title]
    entry.pos = pos_info["pos"]
    entry.tags.extend(pos_info.get("tags", []))

    # A forms_data whose POS is still "unknown" adopts this section's POS.
    if forms_data.pos == "unknown":
        forms_data.pos = entry.pos

    if forms_data.pos != entry.pos:
        forms_data.forms.clear()
        forms_data.categories.clear()
    else:
        entry.forms.extend(forms_data.forms)
        entry.categories.extend(forms_data.categories)

    extract_pos_section_nodes(
        wxr, page_data, base_data, forms_data, level_node
    )

39 

40 

# Exact names of the noun form-of templates handled inside a POS section.
_NOUN_FORM_OF_TEMPLATE_NAMES = ("noun-pl", "noun-form")

# Name prefixes of the verb form-of templates handled inside a POS section.
_VERB_FORM_OF_TEMPLATE_PREFIXES = (
    "1ps",
    "2ps",
    "aanv-w",
    "onv-d",
    "ott-",
    "ovt-",
    "tps",
    "volt-d",
    "eng-onv-d",
)


def extract_pos_section_nodes(
    wxr: WiktextractContext,
    page_data: list[WordEntry],
    base_data: WordEntry,
    forms_data: WordEntry,
    level_node: LevelNode,
) -> None:
    """Walk the direct children of a POS section and extract their data.

    Handled children, in order of precedence:

    * a list whose marker ends with "#": each list item becomes a sense of
      the last entry in ``page_data``; the nodes in front of the *first*
      such list are passed once to ``extract_pos_header_line_nodes``;
    * a nested level node: parsed as a section of its own when its title is
      a key of ``POS_DATA``, otherwise iteration stops at that node;
    * an example template (only when the last entry already has a sense):
      attached to the most recent sense;
    * a noun or verb form-of template: handed to the matching extractor.

    ``page_data`` is mutated in place; nothing is returned.
    """
    # A boolean flag rather than "index == 0": the first gloss list may
    # itself be child 0, and the header nodes must still be handled only
    # once, using only the nodes in front of that first list.
    header_extracted = False
    for index, node in enumerate(level_node.children):
        if (
            isinstance(node, WikiNode)
            and node.kind == NodeKind.LIST
            and node.sarg.endswith("#")
        ):
            if not header_extracted:
                header_extracted = True
                extract_pos_header_line_nodes(
                    wxr, page_data[-1], level_node.children[:index]
                )
            for list_item in node.find_child(NodeKind.LIST_ITEM):
                extract_gloss_list_item(wxr, page_data[-1], list_item)
        elif isinstance(node, LevelNode):
            title_text = clean_node(wxr, None, node.largs)
            if title_text in POS_DATA:
                # expanded from "eng-onv-d" form-of template
                from .page import parse_section

                parse_section(wxr, page_data, base_data, forms_data, node)
            else:
                break
        elif isinstance(node, TemplateNode):
            name = node.template_name
            if name in EXAMPLE_TEMPLATES and len(page_data[-1].senses) > 0:
                extract_example_template(
                    wxr, page_data[-1].senses[-1], node
                )
            elif name in _NOUN_FORM_OF_TEMPLATE_NAMES:
                extract_noun_form_of_template(wxr, page_data[-1], node)
            elif name.startswith(_VERB_FORM_OF_TEMPLATE_PREFIXES):
                extract_verb_form_of_template(
                    wxr, page_data, base_data, forms_data, node
                )

98 

99 

# Templates inside a gloss list item whose expanded text is stored as a raw
# tag of the sense instead of being added to the gloss text.
# https://nl.wiktionary.org/wiki/Categorie:Lemmasjablonen
# https://nl.wiktionary.org/wiki/Categorie:Werkwoordsjablonen
GLOSS_TAG_TEMPLATES = frozenset({"auxl", "erga", "inerg"})

103 

104 

def extract_gloss_list_item(
    wxr: WiktextractContext, word_entry: WordEntry, list_item: WikiNode
) -> None:
    """Turn one gloss list item into a ``Sense`` of ``word_entry``.

    Children of the list item are routed as follows:

    * templates named in ``GLOSS_TAG_TEMPLATES``: expanded text becomes a
      raw tag;
    * other templates and italic nodes: text fully wrapped in parentheses
      becomes a raw tag (parentheses and spaces stripped), any other text
      is part of the gloss;
    * nested lists whose marker ends with "*": every item is passed to
      ``extract_example_list_item``;
    * everything else: part of the gloss.

    The sense is appended only when the resulting gloss text is non-empty.
    """
    sense = Sense()
    gloss_parts = []

    def add_tag_or_gloss(text: str) -> None:
        # "(…)" marks a qualifier; anything else belongs to the gloss.
        if text.startswith("(") and text.endswith(")"):
            sense.raw_tags.append(text.strip("() "))
        else:
            gloss_parts.append(text)

    for child in list_item.children:
        if isinstance(child, TemplateNode):
            expanded = clean_node(wxr, sense, child)
            if child.template_name in GLOSS_TAG_TEMPLATES:
                sense.raw_tags.append(expanded)
            else:
                add_tag_or_gloss(expanded)
        elif isinstance(child, WikiNode) and child.kind == NodeKind.LIST:
            if child.sarg.endswith("*"):
                for example_item in child.find_child(NodeKind.LIST_ITEM):
                    extract_example_list_item(wxr, sense, example_item)
        elif isinstance(child, WikiNode) and child.kind == NodeKind.ITALIC:
            add_tag_or_gloss(clean_node(wxr, sense, child))
        else:
            gloss_parts.append(child)

    gloss_text = clean_node(wxr, sense, gloss_parts)
    if gloss_text.startswith(","):  # between qualifier templates
        gloss_text = gloss_text.removeprefix(",").strip()
    leading_tag = re.match(r"\(([^()]+)\)", gloss_text)
    if leading_tag is not None:
        # expanded "verouderd" template in "2ps" template
        gloss_text = gloss_text[leading_tag.end() :].strip()
        sense.raw_tags.append(leading_tag.group(1))

    if not gloss_text:
        return
    sense.glosses.append(gloss_text)
    translate_raw_tags(sense)
    word_entry.senses.append(sense)

146 

147 

def extract_pos_header_line_nodes(
    wxr: WiktextractContext, word_entry: WordEntry, nodes: list[WikiNode | str]
) -> None:
    """Read the nodes that precede the first gloss list of a POS section.

    The first text node containing ``[...]`` supplies
    ``word_entry.etymology_index`` (the stripped text between the
    brackets); once the index is set, later text nodes are ignored.
    Every "-l-" template is passed to ``extract_l_template``.
    """
    for node in nodes:
        if isinstance(node, str):
            if word_entry.etymology_index != "":
                continue
            bracketed = re.search(r"\[(.+)\]", node.strip())
            if bracketed is not None:
                word_entry.etymology_index = bracketed.group(1).strip()
        elif isinstance(node, TemplateNode) and node.template_name == "-l-":
            extract_l_template(wxr, word_entry, node)

158 

159 

def extract_l_template(
    wxr: WiktextractContext, word_entry: WordEntry, node: TemplateNode
) -> None:
    """Add the tags implied by the first argument of a "-l-" template.

    Recognised argument values are "n", "m", "fm" and "p"; any other value
    adds nothing to ``word_entry.tags``.

    Template page: https://nl.wiktionary.org/wiki/Sjabloon:-l-
    """
    tags_by_arg = {
        "n": ["neuter"],
        "m": ["masculine"],
        "fm": ["feminine", "masculine"],
        "p": ["plural"],
    }
    first_arg = clean_node(wxr, None, node.template_parameters.get(1, ""))
    word_entry.tags.extend(tags_by_arg.get(first_arg, []))

176 

177 

# Tag tables for the noun form-of templates:
# https://nl.wiktionary.org/wiki/Sjabloon:noun-pl
# https://nl.wiktionary.org/wiki/Sjabloon:noun-form

# Value of the "getal" template argument -> tag.
NOUN_FORM_OF_TEMPLATE_NUM_TAGS = {
    "s": "singular",
    "p": "plural",
    "d": "dual",
    "c": "collective",
}

# Value of the "gesl" template argument -> one tag (str) or several (list).
NOUN_FORM_OF_TEMPLATE_GENDER_TAGS = {
    "m": "masculine",
    "f": "feminine",
    "n": "neuter",
    "c": "common",
    "fm": ["feminine", "masculine"],
    "mf": ["feminine", "masculine"],
    "mn": ["masculine", "neuter"],
}

196 

197 

def extract_noun_form_of_template(
    wxr: WiktextractContext, word_entry: WordEntry, t_node: TemplateNode
) -> None:
    """Create a "form-of" sense from a "noun-pl" or "noun-form" template.

    Tags: "noun-pl" always adds "plural"; "noun-form" maps its "getal"
    argument through ``NOUN_FORM_OF_TEMPLATE_NUM_TAGS``.  For both
    templates the "gesl" argument is mapped through
    ``NOUN_FORM_OF_TEMPLATE_GENDER_TAGS``.  Unknown or missing argument
    values add no tag.

    The first positional argument, when non-empty, is recorded as the word
    this entry is a form of.  The gloss is the text of the first list item
    found in the expanded template.  The sense is appended to
    ``word_entry.senses``.
    """
    params = t_node.template_parameters
    sense = Sense(tags=["form-of"])

    # Arguments are cleaned to plain text before being used as dict keys,
    # the same way argument 1 is handled below: an argument that contains
    # markup is not a plain str and may not even be hashable.
    if t_node.template_name == "noun-pl":
        sense.tags.append("plural")
    else:
        num_arg = clean_node(wxr, None, params.get("getal", ""))
        if num_arg in NOUN_FORM_OF_TEMPLATE_NUM_TAGS:
            sense.tags.append(NOUN_FORM_OF_TEMPLATE_NUM_TAGS[num_arg])

    gender_arg = clean_node(wxr, None, params.get("gesl", ""))
    if gender_arg in NOUN_FORM_OF_TEMPLATE_GENDER_TAGS:
        gender_tag = NOUN_FORM_OF_TEMPLATE_GENDER_TAGS[gender_arg]
        if isinstance(gender_tag, str):
            sense.tags.append(gender_tag)
        elif isinstance(gender_tag, list):
            sense.tags.extend(gender_tag)

    form_of = clean_node(wxr, None, params.get(1, ""))
    if form_of != "":
        sense.form_of.append(AltForm(word=form_of))

    expanded_node = wxr.wtp.parse(
        wxr.wtp.node_to_wikitext(t_node), expand_all=True
    )
    # Only the first list item of the expansion supplies the gloss.
    for list_item in expanded_node.find_child_recursively(
        NodeKind.LIST_ITEM
    ):
        sense.glosses.append(clean_node(wxr, None, list_item.children))
        break
    # Result discarded; called with the sense as target, presumably to
    # collect categories from the expansion -- TODO confirm.
    clean_node(wxr, sense, expanded_node)
    word_entry.senses.append(sense)

229 

230 

def extract_verb_form_of_template(
    wxr: WiktextractContext,
    page_data: list[WordEntry],
    base_data: WordEntry,
    forms_data: WordEntry,
    t_node: TemplateNode,
) -> None:
    """Expand a verb form-of template and mark the results as "form-of".

    The template is expanded and its nodes are processed by
    ``extract_pos_section_nodes`` as if they were section content.  Then,
    for the entry that was last in ``page_data`` before the expansion and
    for every entry appended during it:

    * every sense gets the "form-of" tag and, when the template's first
      argument is non-empty, an ``AltForm`` for that word;
    * ``extract_section_categories`` is run against the expanded node;
    * the entry itself gets the "form-of" tag.
    """
    # https://nl.wiktionary.org/wiki/Categorie:Werkwoordsvormsjablonen_voor_het_Nederlands
    # Imported here rather than at module level, presumably to avoid a
    # circular import with .page -- TODO confirm.
    from .page import extract_section_categories

    entries_before = len(page_data)
    wikitext = wxr.wtp.node_to_wikitext(t_node)
    expanded = wxr.wtp.parse(wikitext, expand_all=True)
    extract_pos_section_nodes(
        wxr, page_data, base_data, forms_data, expanded
    )

    lemma = clean_node(wxr, None, t_node.template_parameters.get(1, ""))
    has_lemma = lemma != ""

    # Negative offset from the end: one entry that already existed plus
    # however many were appended while processing the expansion.
    first_affected = entries_before - len(page_data) - 1
    for word_entry in page_data[first_affected:]:
        for sense in word_entry.senses:
            sense.tags.append("form-of")
            if has_lemma:
                sense.form_of.append(AltForm(word=lemma))
        extract_section_categories(wxr, word_entry, expanded)
        word_entry.tags.append("form-of")