Coverage for src/wiktextract/extractor/pl/pos.py: 83%

96 statements  

« prev     ^ index     » next       coverage.py v7.6.10, created at 2024-12-27 08:07 +0000

1import re 

2 

3from wikitextprocessor import LevelNode, NodeKind, TemplateNode, WikiNode 

4 

5from ...page import clean_node 

6from ...wxr_context import WiktextractContext 

7from .models import AltForm, Sense, WordEntry 

8from .tags import translate_raw_tags 

9 

# Lookup table from the Polish part-of-speech wording found on a POS line to
# the data stored on the word entry: "pos" is required, "tags" is optional
# (see update_pos_data). Several keys are inflected or adjectival forms of
# the same word and therefore share a value.
# All POS categories:
# https://pl.wiktionary.org/wiki/Kategoria:Części_mowy_wg_języków
# Polish POS:
# https://pl.wiktionary.org/wiki/Kategoria:Części_mowy_języka_polskiego
POS_DATA = {
    "czasownik": {"pos": "verb"},
    "czasownika": {"pos": "verb"},
    # text of the template "Szablon:szwedzki czasownik frazowy"
    # (Swedish phrasal verb)
    "czasownik frazowy (partikelverb)": {"pos": "verb", "tags": ["phrase"]},
    "fraza": {"pos": "phrase"},
    "klasyfikator": {"pos": "classifier"},
    "liczebnik": {"pos": "num"},
    "liczebnikowa": {"pos": "num"},
    "międzyrostek": {"pos": "interfix", "tags": ["morpheme"]},
    "morfem": {"pos": "unknown", "tags": ["morpheme"]},
    "określnik": {"pos": "det"},
    "partykuła": {"pos": "particle"},
    "partykułowa": {"pos": "particle"},
    # text of the template "Szablon:phrasal verb"
    "phrasal verb (czasownik frazowy)": {"pos": "verb", "tags": ["phrase"]},
    "przedimek": {"pos": "article"},
    "przedrostek": {"pos": "prefix", "tags": ["morpheme"]},
    "przyimek": {"pos": "prep"},
    "przyimkowa": {"pos": "prep_phrase"},
    "przymiotnik": {"pos": "adj"},
    "przymiotnikowym": {"pos": "adj"},
    "przymiotnikowa": {"pos": "adj_phrase"},
    "przyrostek": {"pos": "suffix", "tags": ["morpheme"]},
    "przysłówek": {"pos": "adv"},
    "przysłówkowa": {"pos": "adv_phrase"},
    # second word of "zaimek pytajny" (interrogative pronoun)
    "pytajny": {"pos": "pron", "tags": ["interrogative"]},
    "rodzajnik": {"pos": "article", "tags": ["gendered"]},
    "rzeczownik": {"pos": "noun"},
    "rzeczownikowa": {"pos": "noun"},
    "skrótowiec": {"pos": "abbrev", "tags": ["abbreviation"]},
    "spójnik": {"pos": "conj"},
    "symbol": {"pos": "symbol"},
    "wrostek": {"pos": "infix", "tags": ["morpheme"]},
    "wykrzyknik": {"pos": "intj"},
    "wykrzyknika": {"pos": "intj"},
    "wykrzyknikowa": {"pos": "intj"},
    "zaimka": {"pos": "pron"},
    "zaimek": {"pos": "pron"},
    "zaimkowy": {"pos": "pron"},
}

55 

# POS data selected by prefix: expanded template text on a POS line that is
# not a key of POS_DATA is tested with str.startswith() against these keys in
# insertion order, and the first match is used.
# Category: proverb templates
# https://pl.wiktionary.org/wiki/Kategoria:Szablony_przysłów
POS_PREFIXES = {
    "przysłowie": {"pos": "proverb"},
    "sentencja": {"pos": "phrase"},
}

62 

# POS-line text that is neither a part of speech nor worth keeping: items in
# this set are skipped instead of being stored as raw tags.
IGNORE_POS_LINE_TEXT = frozenset(["rodzaj"])

64 

65 

def extract_pos_section(
    wxr: WiktextractContext,
    page_data: list[WordEntry],
    base_data: WordEntry,
    level_node: LevelNode,
) -> None:
    """Extract POS lines and gloss lists from a section.

    Only the italic and list children of ``level_node`` are visited, in the
    order ``find_child`` yields them. An italic node is handed to
    ``process_pos_line_italic_node``. Each item of a list node is handed to
    ``process_gloss_list_item`` together with the last entry of
    ``page_data``; if ``page_data`` is still empty at that point, a deep copy
    of ``base_data`` is appended first so the glosses have an entry to
    attach to.
    """
    wanted_kinds = NodeKind.ITALIC | NodeKind.LIST
    for node in level_node.find_child(wanted_kinds):
        if node.kind == NodeKind.ITALIC:
            process_pos_line_italic_node(wxr, page_data, base_data, node)
            continue
        if node.kind != NodeKind.LIST:
            continue
        for list_item in node.find_child(NodeKind.LIST_ITEM):
            if not page_data:
                page_data.append(base_data.model_copy(deep=True))
            process_gloss_list_item(wxr, page_data[-1], list_item)

80 

81 

def process_pos_line_italic_node(
    wxr: WiktextractContext,
    page_data: list[WordEntry],
    base_data: WordEntry,
    italic_node: WikiNode,
) -> None:
    """Start a new word entry from an italic part-of-speech line.

    A deep copy of ``base_data`` is appended to ``page_data`` and filled in
    from the children of ``italic_node``:

    * ``forma ...`` templates: the expanded text before the first ``", "``
      is looked up in ``POS_DATA`` and the entry is tagged ``form-of``.
    * Other templates: the expanded text is looked up in ``POS_DATA``, then
      matched against the ``POS_PREFIXES`` keys; text matching neither is
      kept as a raw tag unless it is in ``IGNORE_POS_LINE_TEXT``.
    * Plain strings: split on whitespace; each word is looked up in
      ``POS_DATA`` or kept as a raw tag unless ignored.

    Raw tags are then translated. If no child identified a part of speech,
    the new entry is removed from ``page_data`` again.
    """
    has_pos = False
    page_data.append(base_data.model_copy(deep=True))
    entry = page_data[-1]
    for child in italic_node.children:
        if isinstance(child, TemplateNode):
            child_text = clean_node(wxr, entry, child)
            if child.template_name.startswith("forma "):
                # inflection form header templates
                # https://pl.wiktionary.org/wiki/Kategoria:Szablony_nagłówków_form_fleksyjnych
                pos_text = child_text.split(", ")[0]
                if pos_text in POS_DATA:
                    update_pos_data(entry, pos_text, POS_DATA[pos_text])
                    has_pos = True
                entry.tags.append("form-of")
            elif child_text in POS_DATA:
                update_pos_data(entry, child_text, POS_DATA[child_text])
                has_pos = True
            else:
                for prefix, pos_data in POS_PREFIXES.items():
                    if child_text.startswith(prefix):
                        update_pos_data(entry, child_text, pos_data)
                        # A prefix match sets the entry's POS, so it must
                        # count as one; otherwise the entry is popped below
                        # and the POS just stored is thrown away.
                        has_pos = True
                        break
                else:
                    # No prefix matched: keep the text as a raw tag.
                    if child_text not in IGNORE_POS_LINE_TEXT:
                        entry.raw_tags.append(child_text)
        elif isinstance(child, str):
            for text in child.strip(", ").split():
                text = text.strip(", ")
                if text in POS_DATA:
                    update_pos_data(entry, text, POS_DATA[text])
                    has_pos = True
                elif text not in IGNORE_POS_LINE_TEXT:
                    entry.raw_tags.append(text)
    translate_raw_tags(entry)
    if not has_pos:
        # Not a POS line after all: discard the speculative entry.
        page_data.pop()

124 

125 

def update_pos_data(
    word_entry: WordEntry, pos_text: str, pos_data: dict
) -> None:
    """Copy part-of-speech information onto ``word_entry``.

    ``pos_data["pos"]`` becomes the entry's ``pos`` (a missing key raises
    ``KeyError``), the optional ``pos_data["tags"]`` are appended to the
    entry's existing ``tags``, and ``pos_text`` is stored as ``pos_text``.
    """
    pos = pos_data["pos"]
    extra_tags = pos_data.get("tags", [])
    word_entry.pos = pos
    word_entry.tags.extend(extra_tags)
    word_entry.pos_text = pos_text

132 

133 

def process_gloss_list_item(
    wxr: WiktextractContext, word_entry: WordEntry, list_item_node: WikiNode
) -> None:
    """Turn one gloss list item into a ``Sense`` on ``word_entry``.

    Children of the list item are handled as follows:

    * ``wikipedia`` templates are skipped.
    * Every other template is passed to ``process_form_of_template`` and
      then expanded. If the expanded text ends with ``"."`` and the template
      has no parameters, the text is kept as a raw tag; otherwise the
      expanded nodes become part of the gloss.
    * Non-template children become part of the gloss unchanged.

    A leading ``(N.N)`` marker in the gloss text is removed and stored as the
    sense index. If the entry is tagged ``form-of`` and the sense has no
    ``form_of`` yet, the text of the last link among the gloss nodes is used
    as the form-of word. The sense is appended only when the remaining gloss
    text is not empty.
    """
    sense = Sense()
    abbreviation_tags = []
    body_nodes = []
    for item in list_item_node.children:
        if not isinstance(item, TemplateNode):
            body_nodes.append(item)
            continue
        if item.template_name == "wikipedia":
            continue
        process_form_of_template(wxr, sense, item)
        item_wikitext = wxr.wtp.node_to_wikitext(item)
        expanded = wxr.wtp.parse(item_wikitext, expand_all=True)
        expanded_text = clean_node(wxr, sense, expanded.children)
        # Parameterless templates expanding to text that ends with a full
        # stop are treated as tags, see
        # https://pl.wiktionary.org/wiki/Pomoc:Skróty_używane_w_Wikisłowniku
        is_abbreviation = (
            expanded_text.endswith(".")
            and len(item.template_parameters) == 0
        )
        if is_abbreviation:
            abbreviation_tags.append(expanded_text)
        else:
            body_nodes.extend(expanded.children)

    gloss_text = clean_node(wxr, sense, body_nodes)
    sense_index = ""
    index_match = re.match(r"\(\d+\.\d+\)", gloss_text)
    if index_match is not None:
        sense_index = index_match.group(0).strip("()")
        gloss_text = gloss_text[index_match.end() :].strip("=; ")

    if "form-of" in word_entry.tags and not sense.form_of:
        link_texts = [
            clean_node(wxr, None, node)
            for node in body_nodes
            if isinstance(node, WikiNode) and node.kind == NodeKind.LINK
        ]
        # The last link in the gloss wins.
        target = link_texts[-1] if link_texts else ""
        if target:
            sense.form_of.append(AltForm(word=target))

    if not gloss_text:
        return
    sense.raw_tags = abbreviation_tags
    sense.sense_index = sense_index
    sense.glosses.append(gloss_text)
    translate_raw_tags(sense)
    word_entry.senses.append(sense)

178 

179 

180def process_form_of_template( 

181 wxr: WiktextractContext, sense: Sense, template_node: TemplateNode 

182) -> None: 

183 if template_node.template_name == "zob-ekwiw-pupr": 

184 if "form-of" not in sense.tags: 184 ↛ 186line 184 didn't jump to line 186 because the condition on line 184 was always true

185 sense.tags.append("form-of") 

186 word = clean_node( 

187 wxr, None, template_node.template_parameters.get(1, "") 

188 ) 

189 sense.form_of.append(AltForm(word=word))