Coverage for src/wiktextract/extractor/pl/pos.py: 78%

119 statements  

« prev     ^ index     » next       coverage.py v7.9.0, created at 2025-06-13 07:43 +0000

1import re 

2 

3from wikitextprocessor import LevelNode, NodeKind, TemplateNode, WikiNode 

4 

5from ...page import clean_node 

6from ...wxr_context import WiktextractContext 

7from .models import AltForm, Sense, WordEntry 

8from .tags import TAGS, translate_raw_tags 

9 

# All POS categories
# https://pl.wiktionary.org/wiki/Kategoria:Części_mowy_wg_języków
# Polish POS
# https://pl.wiktionary.org/wiki/Kategoria:Części_mowy_języka_polskiego
#
# Maps a POS name as it appears on a Polish Wiktionary POS header line to
# the data stored on the word entry: a required "pos" value and an optional
# "tags" list.  Inflected and adjectival spellings of the same name
# (e.g. "czasownik"/"czasownika") are listed as separate keys.
POS_DATA = {
    "czasownik": {"pos": "verb"},
    "czasownika": {"pos": "verb"},
    # Template page "Szablon:szwedzki czasownik frazowy"
    "czasownik frazowy (partikelverb)": {"pos": "verb", "tags": ["phrase"]},
    "fraza": {"pos": "phrase"},
    "klasyfikator": {"pos": "classifier"},
    "liczebnik": {"pos": "num"},
    "liczebnikowa": {"pos": "num"},
    "międzyrostek": {"pos": "interfix", "tags": ["morpheme"]},
    "morfem": {"pos": "unknown", "tags": ["morpheme"]},
    "określnik": {"pos": "det"},
    "partykuła": {"pos": "particle"},
    "partykułowa": {"pos": "particle"},
    # Template page "Szablon:phrasal verb"
    "phrasal verb (czasownik frazowy)": {"pos": "verb", "tags": ["phrase"]},
    "przedimek": {"pos": "article"},
    "przedrostek": {"pos": "prefix", "tags": ["morpheme"]},
    "przedrostkowy": {"pos": "prefix", "tags": ["morpheme"]},
    "przyimek": {"pos": "prep"},
    "przyimkowa": {"pos": "prep_phrase"},
    "przymiotnik": {"pos": "adj"},
    "przymiotnikowym": {"pos": "adj"},
    "przymiotnikowa": {"pos": "adj_phrase"},
    "przyrostek": {"pos": "suffix", "tags": ["morpheme"]},
    "przyrostkowy": {"pos": "suffix", "tags": ["morpheme"]},
    "przysłówek": {"pos": "adv"},
    "przysłówkowa": {"pos": "adv_phrase"},
    "pytajny": {"pos": "pron", "tags": ["interrogative"]},  # "zaimek pytajny"
    "rodzajnik": {"pos": "article", "tags": ["gendered"]},
    "rzeczownik": {"pos": "noun"},
    "rzeczownikowa": {"pos": "noun"},
    "skrótowiec": {"pos": "abbrev", "tags": ["abbreviation"]},
    "skrót": {"pos": "abbrev", "tags": ["abbreviation"]},
    "spójnik": {"pos": "conj"},
    "symbol": {"pos": "symbol"},
    "wrostek": {"pos": "infix", "tags": ["morpheme"]},
    "wykrzyknik": {"pos": "intj"},
    "wykrzyknika": {"pos": "intj"},
    "wykrzyknikowa": {"pos": "intj"},
    "zaimka": {"pos": "pron"},
    "zaimek": {"pos": "pron"},
    "zaimkowy": {"pos": "pron"},
    "znak interpunkcyjny": {"pos": "punct", "tags": ["punctuation"]},
    "dopełniacz saksoński": {"pos": "unknown"},
    "forma ściągnięta": {
        "pos": "contraction",
        "tags": ["contraction", "form-of"],
    },
    "słowotwórczy": {"pos": "unknown", "tags": ["morpheme"]},
    "liczebnik porządkowy": {"pos": "adj", "tags": ["ordinal"]},
    "liczebnik główny": {"pos": "adj", "tags": ["cardinal"]},
    "litera": {"pos": "character", "tags": ["letter"]},
    "związek frazeologiczny": {"pos": "phrase", "tags": ["idiomatic"]},
    "związek wyrazów": {"pos": "unknown"},
    "sentencja łacińska": {"pos": "unknown"},
    "imiesłów": {"pos": "verb", "tags": ["participle"]},
    "postpozycja": {"pos": "postp"},
    "zwrot": {"pos": "phrase"},
    "słowo pomocnicze": {"pos": "unknown"},
    "wyrażenie": {"pos": "phrase"},
    "czasownik frazowy": {"pos": "verb", "tags": ["phrasal"]},
    "zaimek osobowy": {"pos": "pron", "tags": ["person"]},
    "zaimek pytajny": {"pos": "pron", "tags": ["interrogative"]},
    "zbitka": {"pos": "unknown"},
    "nazwa własna": {"pos": "name"},
    "rzeczownik odczasownikowy": {
        "pos": "verb",
        "tags": ["participle", "gerund"],
    },
}

85 

# Category:Proverb Templates
# https://pl.wiktionary.org/wiki/Kategoria:Szablony_przysłów
#
# POS data selected by prefix: used when the expanded text of a header
# template merely starts with one of these keys instead of matching a
# POS_DATA key exactly.
POS_PREFIXES = {
    "przysłowie": {"pos": "proverb"},
    "sentencja": {"pos": "phrase"},
}

92 

# Filler words on a POS header line ("rodzaj" = gender, "lub" = or) that
# must not be recorded as raw tags.
IGNORE_POS_LINE_TEXT = frozenset(["rodzaj", "lub"])

94 

95 

def extract_pos_section(
    wxr: WiktextractContext,
    page_data: list[WordEntry],
    base_data: WordEntry,
    level_node: LevelNode,
) -> None:
    """Extract POS header lines and gloss lists from a section.

    Iterates over the italic and list children of ``level_node``.  An italic
    node is treated as a POS header line only when it is the first such
    child or directly follows a list; it is handed to
    ``process_pos_line_italic_node``.  Each list item of a list node is
    parsed as a gloss into the last entry of ``page_data``.

    Args:
        wxr: Extraction context passed through to the helpers.
        page_data: Entries collected so far; new entries are appended here.
        base_data: Template entry that is deep-copied for every new entry.
        level_node: The section node whose children are scanned.
    """
    has_pos = False
    # Start as True so the very first italic node counts as a header line.
    last_node_is_list = True
    for node in level_node.find_child(NodeKind.ITALIC | NodeKind.LIST):
        if node.kind == NodeKind.ITALIC:
            if last_node_is_list:
                new_has_pos = process_pos_line_italic_node(
                    wxr, page_data, base_data, node
                )
                if new_has_pos:
                    has_pos = True
                # Further italic nodes before the next list are skipped.
                last_node_is_list = False
        elif node.kind == NodeKind.LIST:
            if not has_pos:
                # No header line produced an entry yet: glosses still need
                # an entry to attach to.
                page_data.append(base_data.model_copy(deep=True))
            for list_item in node.find_child(NodeKind.LIST_ITEM):
                process_gloss_list_item(wxr, page_data[-1], list_item)
            last_node_is_list = True

119 

120 

def process_pos_line_italic_node(
    wxr: WiktextractContext,
    page_data: list[WordEntry],
    base_data: WordEntry,
    italic_node: WikiNode,
) -> bool:
    """Parse one italic POS header line into a new word entry.

    A deep copy of ``base_data`` is appended to ``page_data`` and filled from
    the children of ``italic_node``: template children and plain strings are
    matched against ``POS_DATA`` / ``POS_PREFIXES``; other words are stored
    as raw tags unless listed in ``IGNORE_POS_LINE_TEXT``.

    Args:
        wxr: Extraction context used to render nodes to text.
        page_data: Entries collected so far; the new entry is appended here.
        base_data: Template entry that is deep-copied for the new entry.
        italic_node: The italic node holding the header line.

    Returns:
        True if a POS was recognized.  Otherwise the entry appended by this
        call is removed from ``page_data`` again and False is returned.
    """
    has_pos = False
    page_data.append(base_data.model_copy(deep=True))
    word_entry = page_data[-1]
    for child in italic_node.children:
        if isinstance(child, TemplateNode):
            child_text = clean_node(wxr, word_entry, child)
            if child.template_name.startswith("forma "):
                # inflection form header templates
                # https://pl.wiktionary.org/wiki/Kategoria:Szablony_nagłówków_form_fleksyjnych
                pos_text = child_text.split(", ")[0]
                if pos_text in POS_DATA:
                    update_pos_data(word_entry, pos_text, POS_DATA[pos_text])
                    has_pos = True
                # Tagged even when the POS itself was not recognized.
                word_entry.tags.append("form-of")
            elif child_text in POS_DATA:
                update_pos_data(word_entry, child_text, POS_DATA[child_text])
                has_pos = True
            else:
                for prefix, pos_data in POS_PREFIXES.items():
                    if child_text.startswith(prefix):
                        update_pos_data(word_entry, child_text, pos_data)
                        has_pos = True
                        break
                if not has_pos:
                    # Fall back to the first single word that names a POS.
                    for text in child_text.split():
                        if text in POS_DATA:
                            update_pos_data(word_entry, text, POS_DATA[text])
                            has_pos = True
                            break
                if not has_pos and child_text not in IGNORE_POS_LINE_TEXT:
                    word_entry.raw_tags.append(child_text)
        elif isinstance(child, str):
            stripped = child.strip()
            if stripped in POS_DATA:
                update_pos_data(word_entry, stripped, POS_DATA[stripped])
                has_pos = True
            else:
                # Comma-separated parts first, then single words per part.
                for text in child.strip(", ").split(","):
                    text = text.strip()
                    if text in POS_DATA:
                        update_pos_data(word_entry, text, POS_DATA[text])
                        has_pos = True
                    elif text in TAGS:
                        word_entry.raw_tags.append(text)
                    else:
                        for t in text.split():
                            if t in POS_DATA:
                                update_pos_data(word_entry, t, POS_DATA[t])
                                has_pos = True
                            elif t not in IGNORE_POS_LINE_TEXT:
                                word_entry.raw_tags.append(t)
    translate_raw_tags(word_entry)
    if not has_pos:
        # Not a POS line after all: discard the entry created above.
        page_data.pop()
    return has_pos

181 

182 

def update_pos_data(
    word_entry: WordEntry, pos_text: str, pos_data: dict
) -> None:
    """Copy POS information onto ``word_entry``.

    Args:
        word_entry: Entry to update in place.
        pos_text: The POS name as found in the page; stored as ``pos_text``.
        pos_data: Mapping with a required ``"pos"`` key and an optional
            ``"tags"`` list; the tags are appended to the entry's tags.
    """
    word_entry.pos = pos_data["pos"]
    # extend() copies the items, so the shared lookup table is never aliased.
    word_entry.tags.extend(pos_data.get("tags", []))
    word_entry.pos_text = pos_text

189 

190 

def process_gloss_list_item(
    wxr: WiktextractContext, word_entry: WordEntry, list_item_node: WikiNode
) -> None:
    """Parse one gloss list item into a ``Sense`` of ``word_entry``.

    Templates in the item are expanded.  A parameterless template whose
    expanded text ends with "." is stored as a raw tag; every other node
    contributes to the gloss text.  A leading "(N.M)" marker becomes the
    sense index.  The sense is appended to ``word_entry.senses`` only when
    the remaining gloss text is not empty.

    Args:
        wxr: Extraction context used to expand and render nodes.
        word_entry: Entry that receives the new sense.
        list_item_node: The list item node holding the gloss.
    """
    sense = Sense()
    gloss_nodes = []
    raw_tags = []
    for gloss_node in list_item_node.children:
        if isinstance(gloss_node, TemplateNode):
            if gloss_node.template_name == "wikipedia":
                continue
            process_form_of_template(wxr, sense, gloss_node)
            expanded_node = wxr.wtp.parse(
                wxr.wtp.node_to_wikitext(gloss_node), expand_all=True
            )
            expanded_text = clean_node(wxr, sense, expanded_node.children)
            if (
                expanded_text.endswith(".")
                and not gloss_node.template_parameters
            ):
                # https://pl.wiktionary.org/wiki/Pomoc:Skróty_używane_w_Wikisłowniku
                raw_tags.append(expanded_text)
            else:
                gloss_nodes.extend(expanded_node.children)
        else:
            gloss_nodes.append(gloss_node)

    gloss_text = clean_node(wxr, sense, gloss_nodes)
    m = re.match(r"\(\d+\.\d+\)", gloss_text)
    sense_index = ""
    if m is not None:
        sense_index = m.group(0).strip("()")
        gloss_text = gloss_text[m.end() :].strip("=; ")

    if "form-of" in word_entry.tags and not sense.form_of:
        # No form-of template supplied the lemma: use the last link of the
        # gloss instead.
        form_of = ""
        for node in gloss_nodes:
            if isinstance(node, WikiNode) and node.kind == NodeKind.LINK:
                form_of = clean_node(wxr, None, node)
        if form_of:
            sense.form_of.append(AltForm(word=form_of))

    if gloss_text:
        sense.raw_tags = raw_tags
        sense.sense_index = sense_index
        sense.glosses.append(gloss_text)
        translate_raw_tags(sense)
        word_entry.senses.append(sense)

235 

236 

def process_form_of_template(
    wxr: WiktextractContext, sense: Sense, template_node: TemplateNode
) -> None:
    """Record form-of data from a "zob-ekwiw-pupr" template.

    For that template the sense is tagged "form-of" (once) and the rendered
    first positional parameter is appended to ``sense.form_of``.  Any other
    template leaves the sense untouched.

    Args:
        wxr: Extraction context used to render the parameter.
        sense: Sense to update in place.
        template_node: The template node found in a gloss list item.
    """
    if template_node.template_name == "zob-ekwiw-pupr":
        if "form-of" not in sense.tags:
            sense.tags.append("form-of")
        word = clean_node(
            wxr, None, template_node.template_parameters.get(1, "")
        )
        sense.form_of.append(AltForm(word=word))