Coverage for src/wiktextract/extractor/pl/pos.py: 79%

125 statements  

« prev     ^ index     » next       coverage.py v7.14.1, created at 2026-06-03 06:55 +0000

1import re 

2 

3from wikitextprocessor import LevelNode, NodeKind, TemplateNode, WikiNode 

4 

5from ...page import clean_node 

6from ...wxr_context import WiktextractContext 

7from .models import AltForm, Attestation, Sense, WordEntry 

8from .section_titles import POS_DATA 

9from .tags import TAGS, translate_raw_tags 

10 

# Proverb header templates (Category:Proverb Templates), see
# https://pl.wiktionary.org/wiki/Kategoria:Szablony_przysłów
# Maps a prefix of the expanded template text to the POS data that
# `update_pos_data` applies when the text starts with that prefix.
POS_PREFIXES = {
    "przysłowie": {"pos": "proverb"},
    "sentencja": {"pos": "phrase"},
}

# Words on a POS header line that are neither POS titles nor tags and
# must not be stored as raw tags.
IGNORE_POS_LINE_TEXT = frozenset({"rodzaj", "lub"})

19 

20 

def extract_pos_section(
    wxr: WiktextractContext,
    page_data: list[WordEntry],
    base_data: WordEntry,
    level_node: LevelNode,
) -> None:
    """Turn the italic and list children of a section into word entries.

    Italic children are handed to `process_pos_line_italic_node`, but only
    when the previously seen child was a list (or when no child has been
    seen yet). List children contribute one sense per list item to the last
    entry in `page_data`.

    Args:
        wxr: Extraction context passed through to the helpers.
        page_data: Entries collected so far; new entries are appended here.
        base_data: Template entry that is deep-copied for each new entry.
        level_node: Section node whose italic and list children are read.
    """
    found_pos = False
    previous_was_list = True
    wanted_kinds = NodeKind.ITALIC | NodeKind.LIST
    for child in level_node.find_child(wanted_kinds):
        if child.kind == NodeKind.ITALIC:
            # Consecutive italic nodes: only the first one after a list is
            # parsed as a POS header line.
            if previous_was_list and process_pos_line_italic_node(
                wxr, page_data, base_data, child
            ):
                found_pos = True
            previous_was_list = False
            continue
        if child.kind != NodeKind.LIST:
            continue
        if not found_pos:
            # No header produced an entry yet, so the glosses need a fresh
            # copy of the base entry to attach to.
            page_data.append(base_data.model_copy(deep=True))
        for item in child.find_child(NodeKind.LIST_ITEM):
            process_gloss_list_item(wxr, page_data[-1], item)
        previous_was_list = True

44 

45 

def process_pos_line_italic_node(
    wxr: WiktextractContext,
    page_data: list[WordEntry],
    base_data: WordEntry,
    italic_node: WikiNode,
) -> bool:
    """Parse an italic POS header line into a new entry.

    A deep copy of `base_data` is appended to `page_data` and filled from
    the template and string children of `italic_node`. If no child yields a
    part of speech, the new entry is removed again.

    Args:
        wxr: Extraction context used when cleaning template nodes.
        page_data: Entry list that receives the new entry.
        base_data: Template entry that is deep-copied.
        italic_node: The italic node holding the header line.

    Returns:
        True when a part of speech was found and the entry was kept.
    """
    found_pos = False
    entry = base_data.model_copy(deep=True)
    page_data.append(entry)
    for child in italic_node.children:
        if isinstance(child, TemplateNode):
            template_text = clean_node(wxr, entry, child)
            if child.template_name.startswith("forma "):
                # inflection form header templates
                # https://pl.wiktionary.org/wiki/Kategoria:Szablony_nagłówków_form_fleksyjnych
                head = template_text.split(", ")[0]
                if head in POS_DATA:
                    update_pos_data(entry, head, POS_DATA[head])
                    found_pos = True
                entry.tags.append("form-of")
            elif template_text in POS_DATA:
                update_pos_data(entry, template_text, POS_DATA[template_text])
                found_pos = True
            else:
                # First try the known prefixes, then any single word of
                # the text, and finally keep the text as a raw tag.
                prefix = next(
                    (p for p in POS_PREFIXES if template_text.startswith(p)),
                    None,
                )
                if prefix is not None:
                    update_pos_data(entry, template_text, POS_PREFIXES[prefix])
                    found_pos = True
                if not found_pos:
                    word = next(
                        (w for w in template_text.split() if w in POS_DATA),
                        None,
                    )
                    if word is not None:
                        update_pos_data(entry, word, POS_DATA[word])
                        found_pos = True
                if not found_pos and template_text not in IGNORE_POS_LINE_TEXT:
                    entry.raw_tags.append(template_text)
        elif isinstance(child, str):
            whole = child.strip()
            if whole in POS_DATA:
                update_pos_data(entry, whole, POS_DATA[whole])
                found_pos = True
                continue
            # Otherwise inspect each comma-separated piece, and within a
            # piece that is neither a POS title nor a tag, each word.
            for piece in child.strip(", ").split(","):
                piece = piece.strip()
                if piece in POS_DATA:
                    update_pos_data(entry, piece, POS_DATA[piece])
                    found_pos = True
                elif piece in TAGS:
                    entry.raw_tags.append(piece)
                else:
                    for token in piece.split():
                        if token in POS_DATA:
                            update_pos_data(entry, token, POS_DATA[token])
                            found_pos = True
                        elif token not in IGNORE_POS_LINE_TEXT:
                            entry.raw_tags.append(token)
    translate_raw_tags(entry)
    if not found_pos:
        page_data.pop()
    return found_pos

106 

107 

def update_pos_data(
    word_entry: WordEntry, pos_text: str, pos_data: dict
) -> None:
    """Copy part-of-speech information onto `word_entry`.

    Args:
        word_entry: Entry whose `pos`, `tags` and `pos_title` are updated.
        pos_text: Text stored as the entry's `pos_title`.
        pos_data: Mapping with a required "pos" key and an optional "tags"
            list that is appended to the entry's existing tags.
    """
    word_entry.pos = pos_data["pos"]
    extra_tags = pos_data.get("tags", [])
    word_entry.tags.extend(extra_tags)
    word_entry.pos_title = pos_text

114 

115 

def process_gloss_list_item(
    wxr: WiktextractContext, word_entry: WordEntry, list_item_node: WikiNode
) -> None:
    """Build a sense from one gloss list item and add it to `word_entry`.

    Templates are handled one by one: "datadef" becomes an attestation,
    "wikipedia" is skipped, and every other template is expanded. An
    expansion that ends with "." and comes from a template without
    parameters is kept as a raw tag; any other expansion joins the gloss.
    The sense is only stored when the resulting gloss text is non-empty.

    Args:
        wxr: Extraction context used for parsing and cleaning nodes.
        word_entry: Entry that receives the new sense.
        list_item_node: The list item node holding the gloss.
    """
    sense = Sense()
    kept_nodes = []
    abbreviation_tags = []
    for child in list_item_node.children:
        if not isinstance(child, TemplateNode):
            kept_nodes.append(child)
            continue
        if child.template_name == "datadef":
            extract_datedef_template(wxr, sense, child)
            continue
        if child.template_name == "wikipedia":
            continue
        process_form_of_template(wxr, sense, child)
        expanded = wxr.wtp.parse(
            wxr.wtp.node_to_wikitext(child), expand_all=True
        )
        expanded_text = clean_node(wxr, sense, expanded.children)
        is_abbreviation = (
            expanded_text.endswith(".")
            and len(child.template_parameters) == 0
        )
        if is_abbreviation:
            # https://pl.wiktionary.org/wiki/Pomoc:Skróty_używane_w_Wikisłowniku
            abbreviation_tags.append(expanded_text)
        else:
            kept_nodes.extend(expanded.children)

    gloss_text = clean_node(wxr, sense, kept_nodes)
    sense_index = ""
    # A leading "(1.2)"-style marker is the sense index, not gloss text.
    index_match = re.match(r"\(\d+\.\d+\)", gloss_text)
    if index_match is not None:
        sense_index = index_match.group(0).strip("()")
        gloss_text = gloss_text[index_match.end() :].strip("=; ")

    if "form-of" in word_entry.tags and not sense.form_of:
        # Fall back to the last link among the gloss nodes.
        linked_word = ""
        for node in kept_nodes:
            if isinstance(node, WikiNode) and node.kind == NodeKind.LINK:
                linked_word = clean_node(wxr, None, node)
        if linked_word:
            sense.form_of.append(AltForm(word=linked_word))

    if not gloss_text:
        return
    sense.raw_tags = abbreviation_tags
    sense.sense_index = sense_index
    sense.glosses.append(gloss_text)
    translate_raw_tags(sense)
    word_entry.senses.append(sense)

165 

166 

def process_form_of_template(
    wxr: WiktextractContext, sense: Sense, template_node: TemplateNode
) -> None:
    """Record form-of data for a "zob-ekwiw-pupr" template.

    Any other template name leaves `sense` untouched. For a matching
    template the "form-of" tag is added if it is not already present, and
    the cleaned first positional parameter (an empty string when it is
    missing) is appended to `sense.form_of`.

    Args:
        wxr: Extraction context used when cleaning the parameter.
        sense: Sense that is updated in place.
        template_node: Template found inside a gloss list item.
    """
    if template_node.template_name != "zob-ekwiw-pupr":
        return
    if "form-of" not in sense.tags:
        sense.tags.append("form-of")
    first_arg = template_node.template_parameters.get(1, "")
    target_word = clean_node(wxr, None, first_arg)
    sense.form_of.append(AltForm(word=target_word))

177 

178 

def extract_datedef_template(
    wxr: WiktextractContext, sense: Sense, t_node: TemplateNode
):
    """Store the date produced by a "datadef" template as an attestation.

    The cleaned template text has surrounding square brackets and spaces
    stripped; nothing is added when the result is empty.

    Args:
        wxr: Extraction context used when cleaning the template.
        sense: Sense whose `attestations` list is extended.
        t_node: The "datadef" template node.
    """
    date_text = clean_node(wxr, None, t_node).strip("[] ")
    if not date_text:
        return
    sense.attestations.append(Attestation(date=date_text))