Coverage for src/wiktextract/extractor/vi/pos.py: 67%

126 statements  

« prev     ^ index     » next       coverage.py v7.10.6, created at 2025-09-12 08:27 +0000

1import re 

2 

3from wikitextprocessor.parser import ( 

4 LEVEL_KIND_FLAGS, 

5 HTMLNode, 

6 LevelNode, 

7 NodeKind, 

8 TemplateNode, 

9 WikiNode, 

10) 

11 

12from ...page import clean_node 

13from ...wxr_context import WiktextractContext 

14from .example import extract_example_list_item 

15from .models import AltForm, Form, Sense, WordEntry 

16from .section_titles import POS_DATA 

17from .tags import translate_raw_tags 

18 

19 

def extract_pos_section(
    wxr: WiktextractContext,
    page_data: list[WordEntry],
    base_data: WordEntry,
    level_node: LevelNode,
    pos_title: str,
):
    """Open a new word entry for a part-of-speech section and populate it.

    A deep copy of ``base_data`` is appended to ``page_data``; the copy gets
    ``pos_title``, the ``pos`` value stored under ``POS_DATA[pos_title]`` and
    any ``tags`` listed there. ``base_data.pos`` is overwritten with the same
    ``pos`` value.

    List items of every child list whose marker both starts and ends with
    ``#`` are passed to ``extract_gloss_list_item``. Template nodes that sit
    before the earliest such list (or anywhere in the section when no such
    list item exists) are passed to ``extract_headword_template``.
    """
    entry = base_data.model_copy(deep=True)
    page_data.append(entry)
    entry.pos_title = pos_title
    pos_info = POS_DATA[pos_title]
    entry.pos = pos_info["pos"]
    base_data.pos = pos_info["pos"]
    entry.tags.extend(pos_info.get("tags", []))

    # Index in `level_node.children` where the headword area stops; it stays
    # at the full length unless a gloss list item is found.
    headword_end = len(level_node.children)
    for position, list_node in level_node.find_child(NodeKind.LIST, True):
        marker = list_node.sarg
        if not (marker.startswith("#") and marker.endswith("#")):
            continue
        for list_item in list_node.find_child(NodeKind.LIST_ITEM):
            extract_gloss_list_item(wxr, entry, list_item)
            headword_end = min(headword_end, position)

    for child in level_node.children[:headword_end]:
        if isinstance(child, TemplateNode):
            extract_headword_template(wxr, entry, child)

45 

46 

# redirect
# NOTE(review): the original one-word note presumably means these names are
# redirects to alt-of / form-of templates — confirm against the wiki.
# Names in these sets are matched exactly by `extract_gloss_list_item`, in
# addition to its " of" / "-of" suffix rule.
ALT_OF_TEMPLATES = frozenset(
    {"altform", "alt form", "vi-alt sp", "vie-alt sp"}
)
FORM_OF_TEMPLATES = frozenset({"số nhiều của", "short for"})

50 

51 

def extract_gloss_list_item(
    wxr: WiktextractContext,
    word_entry: WordEntry,
    list_item: WikiNode,
    parent_sense: Sense | None = None,
):
    """Build a sense from one gloss list item and recurse into its sub-lists.

    The sense starts as a deep copy of ``parent_sense`` (with its examples
    removed) or as an empty ``Sense``. Label, ``term`` and ``@`` templates are
    consumed by their dedicated extractors and left out of the gloss text;
    form-of style templates are extracted and also kept in the gloss text.
    The sense is appended to ``word_entry.senses`` only when the cleaned
    gloss text is non-empty, but child lists are processed either way.
    """
    if parent_sense is None:
        sense = Sense()
    else:
        sense = parent_sense.model_copy(deep=True)
    # Examples belong to the parent sense only.
    sense.examples.clear()

    label_templates = ("nhãn", "label", "def-lb", "context")
    gloss_nodes = []
    for child in list_item.children:
        if not isinstance(child, TemplateNode):
            # Nested lists are handled separately below.
            if not isinstance(child, WikiNode) or child.kind != NodeKind.LIST:
                gloss_nodes.append(child)
            continue

        name = child.template_name
        if name in label_templates:
            extract_label_template(wxr, sense, child)
        elif name == "term":
            extract_term_template(wxr, sense, child)
        elif (
            name.endswith((" of", "-of"))
            or name in ALT_OF_TEMPLATES
            or name in FORM_OF_TEMPLATES
        ):
            extract_form_of_template(wxr, sense, child)
            gloss_nodes.append(child)
        elif name == "@":
            extract_at_template(wxr, sense, child)
        else:
            gloss_nodes.append(child)

    gloss_text = clean_node(wxr, sense, gloss_nodes)
    if gloss_text != "":
        sense.glosses.append(gloss_text)
        translate_raw_tags(sense)
        word_entry.senses.append(sense)

    for nested_list in list_item.find_child(NodeKind.LIST):
        marker = nested_list.sarg
        if not marker.startswith("#"):
            continue
        if marker.endswith("#"):
            # Sub-glosses: each item becomes a sense derived from this one.
            for nested_item in nested_list.find_child(NodeKind.LIST_ITEM):
                extract_gloss_list_item(wxr, word_entry, nested_item, sense)
        elif marker.endswith((":", "*")):
            # Example lines attached to this sense.
            for nested_item in nested_list.find_child(NodeKind.LIST_ITEM):
                extract_example_list_item(
                    wxr, word_entry, sense, nested_item
                )

101 

102 

def extract_label_template(
    wxr: WiktextractContext, sense: Sense, t_node: TemplateNode
):
    """Collect raw tags from an expanded label template.

    The template is expanded and re-parsed; the cleaned text of every
    ``<span>`` carrying the ``label-content`` class is split on commas and
    each non-empty, stripped piece is appended to ``sense.raw_tags``.
    """
    # https://vi.wiktionary.org/wiki/Bản_mẫu:nhãn
    wikitext = wxr.wtp.node_to_wikitext(t_node)
    expanded = wxr.wtp.parse(wikitext, expand_all=True)
    for span_tag in expanded.find_html_recursively("span"):
        if "label-content" not in span_tag.attrs.get("class", "").split():
            continue
        pieces = (
            piece.strip()
            for piece in clean_node(wxr, None, span_tag).split(",")
        )
        sense.raw_tags.extend(piece for piece in pieces if piece != "")
    # Result is discarded; presumably run so `clean_node` can record data
    # (e.g. categories) on `sense` — TODO confirm.
    clean_node(wxr, sense, expanded)

118 

119 

def extract_term_template(
    wxr: WiktextractContext, sense: Sense, t_node: TemplateNode
):
    """Append the text of each direct italic child of an expanded ``term``
    template to ``sense.raw_tags``, skipping empty text."""
    # https://vi.wiktionary.org/wiki/Bản_mẫu:term
    wikitext = wxr.wtp.node_to_wikitext(t_node)
    expanded = wxr.wtp.parse(wikitext, expand_all=True)
    for italic in expanded.find_child(NodeKind.ITALIC):
        text = clean_node(wxr, None, italic)
        if text == "":
            continue
        sense.raw_tags.append(text)

131 

132 

def extract_form_of_template(
    wxr: WiktextractContext, sense: Sense, t_node: TemplateNode
):
    """Record the target word of a form-of / alt-of template on ``sense``.

    The word is the cleaned text of the first ``<i>`` element in the expanded
    template; the romanization is the first ``<span>`` whose classes include
    ``mention-tr``. Nothing is recorded when the word is empty. Otherwise the
    form goes to ``sense.alt_of`` (tag ``alt-of``) when the template name
    contains "alternative" or is in ``ALT_OF_TEMPLATES``, and to
    ``sense.form_of`` (tag ``form-of``) in every other case.
    """
    # https://vi.wiktionary.org/wiki/Thể_loại:Bản_mẫu_dạng_từ
    wikitext = wxr.wtp.node_to_wikitext(t_node)
    expanded = wxr.wtp.parse(wikitext, expand_all=True)
    form = AltForm(word="")

    first_italic = next(iter(expanded.find_html_recursively("i")), None)
    if first_italic is not None:
        form.word = clean_node(wxr, None, first_italic)

    roman_span = next(
        (
            span_tag
            for span_tag in expanded.find_html_recursively("span")
            if "mention-tr" in span_tag.attrs.get("class", "").split()
        ),
        None,
    )
    if roman_span is not None:
        form.roman = clean_node(wxr, None, roman_span)

    name = t_node.template_name
    is_alt_of = "alternative" in name or name in ALT_OF_TEMPLATES
    if form.word == "":
        return

    if is_alt_of:
        target, tag = sense.alt_of, "alt-of"
    else:
        target, tag = sense.form_of, "form-of"
    target.append(form)
    sense.tags.append(tag)

159 

160 

def extract_at_template(
    wxr: WiktextractContext, sense: Sense, t_node: TemplateNode
):
    """Split the text of each direct ``<i>`` child of an expanded ``@``
    template on commas and semicolons and append the non-empty, stripped
    pieces to ``sense.raw_tags``."""
    # https://vi.wiktionary.org/wiki/Thể_loại:@
    # obsolete template
    wikitext = wxr.wtp.node_to_wikitext(t_node)
    expanded = wxr.wtp.parse(wikitext, expand_all=True)
    for i_tag in expanded.find_html("i"):
        for piece in re.split(r",|;", clean_node(wxr, None, i_tag)):
            piece = piece.strip()
            if piece:
                sense.raw_tags.append(piece)

175 

176 

def extract_note_section(
    wxr: WiktextractContext, word_entry: WordEntry, level_node: LevelNode
):
    """Add the notes of a section to ``word_entry.notes``.

    When the section has at least one direct list child, every list item
    yields one note. Otherwise all children that are not of a
    ``LEVEL_KIND_FLAGS`` kind are cleaned together into a single note.
    Empty notes are skipped.
    """
    list_nodes = list(level_node.find_child(NodeKind.LIST))
    if list_nodes:
        note_sources = [
            list_item.children
            for list_node in list_nodes
            for list_item in list_node.find_child(NodeKind.LIST_ITEM)
        ]
    else:
        note_sources = [list(level_node.invert_find_child(LEVEL_KIND_FLAGS))]

    for nodes in note_sources:
        text = clean_node(wxr, None, nodes)
        if text != "":
            word_entry.notes.append(text)

193 

194 

def extract_headword_template(
    wxr: WiktextractContext, word_entry: WordEntry, t_node: TemplateNode
):
    """Extract inflected forms from an expanded headword template.

    Walking italic and HTML nodes in order, the text of the most recent
    italic node is held as a pending raw tag. Each ``<span>`` with the
    ``form-of`` class becomes a ``Form``; a pending raw tag is attached to
    it, translated and then cleared. Non-empty forms are appended to
    ``word_entry.forms``. Finally every direct link child of the expansion is
    run through ``clean_node`` with ``word_entry`` as the data target.
    """
    wikitext = wxr.wtp.node_to_wikitext(t_node)
    expanded = wxr.wtp.parse(wikitext, expand_all=True)

    pending_tag = ""
    for node in expanded.find_child_recursively(
        NodeKind.ITALIC | NodeKind.HTML
    ):
        if node.kind == NodeKind.ITALIC:
            pending_tag = clean_node(wxr, None, node)
            continue
        if not isinstance(node, HTMLNode) or node.tag != "span":
            continue
        if "form-of" not in node.attrs.get("class", "").split():
            continue

        form = Form(form=clean_node(wxr, None, node))
        if pending_tag != "":
            form.raw_tags.append(pending_tag)
            translate_raw_tags(form)
            # A label applies only to the first form that follows it.
            pending_tag = ""
        if form.form != "":
            word_entry.forms.append(form)

    # Return value unused; presumably collects link-derived data such as
    # categories on `word_entry` — TODO confirm.
    for link_node in expanded.find_child(NodeKind.LINK):
        clean_node(wxr, word_entry, link_node)