Coverage for src/wiktextract/extractor/id/pos.py: 82%

133 statements  

« prev     ^ index     » next       coverage.py v7.10.3, created at 2025-08-15 05:18 +0000

1from wikitextprocessor.parser import ( 

2 LEVEL_KIND_FLAGS, 

3 HTMLNode, 

4 LevelNode, 

5 NodeKind, 

6 TemplateNode, 

7 WikiNode, 

8) 

9 

10from ...page import clean_node 

11from ...wxr_context import WiktextractContext 

12from .example import extract_example_list_item 

13from .models import AltForm, Attestation, Example, Form, Sense, WordEntry 

14from .section_titles import POS_DATA 

15from .tags import translate_raw_tags 

16 

17 

def extract_pos_section(
    wxr: WiktextractContext,
    page_data: list[WordEntry],
    base_data: WordEntry,
    level_node: LevelNode,
    pos_title: str,
) -> None:
    """Create a word entry for one part-of-speech section and populate it.

    A deep copy of ``base_data`` is appended to ``page_data`` and given the
    ``pos`` value (and optional ``tags``) stored in ``POS_DATA`` under
    ``pos_title``.  Items of every list child whose marker starts and ends
    with ``#`` are passed to ``extract_gloss_list_item``; the listed
    "lihat" templates are passed to ``extract_lihat_2_template``.  Finally
    the children located before the first such gloss list are handed to
    ``process_pos_header_nodes``.
    """
    entry = base_data.model_copy(deep=True)
    page_data.append(entry)
    entry.pos_title = pos_title
    pos_info = POS_DATA[pos_title]
    entry.pos = pos_info["pos"]
    entry.tags.extend(pos_info.get("tags", []))

    lihat_templates = (
        "lihat 2",
        "lihat ulang",
        "lihat v",
        "lihat2 a",
        "lihat2 adv",
        "lihat v ber2",
        "lihat n",
        "lihat 2 an",
        "lihat v ter2",
        "lihat2 v",
    )

    children = level_node.children
    # Stays at len(children) when no gloss list is found, so that every
    # child is then treated as a header node.
    first_gloss_list = len(children)
    for position, child in enumerate(children):
        if isinstance(child, WikiNode) and child.kind == NodeKind.LIST:
            for item in child.find_child(NodeKind.LIST_ITEM):
                marker = child.sarg
                if not (marker.startswith("#") and marker.endswith("#")):
                    continue
                extract_gloss_list_item(wxr, entry, item)
                first_gloss_list = min(first_gloss_list, position)
            continue
        if (
            isinstance(child, TemplateNode)
            and child.template_name in lihat_templates
        ):
            extract_lihat_2_template(wxr, entry, child)

    process_pos_header_nodes(wxr, entry, children[:first_gloss_list])

56 

57 

def extract_gloss_list_item(
    wxr: WiktextractContext,
    word_entry: WordEntry,
    list_item: WikiNode,
    parent_sense: Sense | None = None,
) -> None:
    """Build a sense from one gloss list item and recurse into its sublists.

    The sense starts as a deep copy of ``parent_sense`` when one is given,
    otherwise as an empty ``Sense``.  Children of ``list_item`` are sorted
    into special templates, raw tags, examples and gloss nodes.  The sense
    is appended to ``word_entry.senses`` only if it ends up with at least
    one gloss.  Nested ``#…#`` lists are processed recursively with this
    sense as parent; nested ``#…:`` / ``#…*`` lists are passed to
    ``extract_example_list_item``.
    """
    if parent_sense is None:
        sense = Sense()
    else:
        sense = parent_sense.model_copy(deep=True)

    gloss_parts = []
    seen_br = False
    for child in list_item.children:
        if isinstance(child, TemplateNode):
            name = child.template_name
            if name.startswith("variasi"):
                extract_variasi_template(wxr, sense, child)
            elif name == "defdate":
                extract_defdate_template(wxr, sense, child)
            else:
                text = clean_node(wxr, sense, child)
                is_tag_text = text.startswith("(") and text.strip().endswith(
                    (")", ") ·")
                )
                if is_tag_text:
                    # Parenthesised expansion: "·"-separated raw tags.
                    for piece in text.split("·"):
                        piece = piece.strip("() ")
                        if piece != "":
                            sense.raw_tags.append(piece)
                else:
                    gloss_parts.append(text)
            continue

        # Only the first <br> switches to "example" mode; later ones fall
        # through and are kept as gloss nodes.
        if isinstance(child, HTMLNode) and child.tag == "br" and not seen_br:
            seen_br = True
            continue

        if isinstance(child, WikiNode):
            if child.kind == NodeKind.ITALIC and seen_br:
                example_text = clean_node(wxr, None, child)
                if example_text != "":
                    sense.examples.append(Example(text=example_text))
                continue
            if child.kind == NodeKind.LIST:
                # Nested lists are handled after the gloss is built.
                continue

        gloss_parts.append(child)

    gloss_text = clean_node(wxr, sense, gloss_parts)
    if gloss_text != "":
        sense.glosses.append(gloss_text)
        if gloss_text.startswith("bentuk "):
            find_form_of_link(wxr, sense, gloss_parts)

    if sense.glosses:
        translate_raw_tags(sense)
        word_entry.senses.append(sense)

    for sub_list in list_item.find_child(NodeKind.LIST):
        marker = sub_list.sarg
        if not marker.startswith("#"):
            continue
        if marker.endswith("#"):
            for sub_item in sub_list.find_child(NodeKind.LIST_ITEM):
                extract_gloss_list_item(wxr, word_entry, sub_item, sense)
        elif marker.endswith((":", "*")):
            for example_item in sub_list.find_child(NodeKind.LIST_ITEM):
                extract_example_list_item(
                    wxr, word_entry, sense, example_item
                )

123 

124 

def extract_lihat_2_template(
    wxr: WiktextractContext, word_entry: WordEntry, t_node: TemplateNode
) -> None:
    """Add one sense per list item produced by a "lihat"-style template.

    The template is expanded and re-parsed; for every list item of the
    result, the text after the first "⇢" becomes the gloss.  If the text
    before the arrow contains ")", everything up to that ")" (without
    surrounding parentheses and spaces) is stored as a raw tag.  Items
    without an arrow, or with nothing after it, add no sense.
    """
    # https://id.wiktionary.org/wiki/Templat:lihat_2
    expanded_template = wxr.wtp.parse(
        wxr.wtp.node_to_wikitext(t_node), expand_all=True
    )
    for list_node in expanded_template.find_child(NodeKind.LIST):
        for list_item in list_node.find_child(NodeKind.LIST_ITEM):
            sense = Sense()
            text = clean_node(wxr, sense, list_item.children)
            prefix, arrow, gloss = text.partition("⇢")
            gloss = gloss.strip()
            # Skip items that would yield a sense with no usable gloss.
            if arrow == "" or gloss == "":
                continue
            sense.glosses.append(gloss)
            # Only look for the tag in front of the arrow: a ")" inside the
            # gloss itself must not turn the whole line into a raw tag.
            if ")" in prefix:
                raw_tag = prefix[: prefix.index(")")].strip("( ")
                if raw_tag != "":
                    sense.raw_tags.append(raw_tag)
            word_entry.senses.append(sense)

146 

147 

def process_pos_header_nodes(
    wxr: WiktextractContext, word_entry: WordEntry, nodes: list[WikiNode | str]
) -> None:
    """Collect forms from the nodes that precede the gloss list.

    A string node whose stripped text ends with ":" sets the current raw
    tag.  Once a bold node has been seen, every link node with non-empty
    text and at least one link argument contributes a ``Form`` built from
    its first argument; the current raw tag, if any, is attached to the
    form and translated.
    """
    label = ""
    seen_bold = False
    for item in nodes:
        if isinstance(item, str):
            if item.strip().endswith(":"):
                label = item.strip("():;, ")
            continue
        if not isinstance(item, WikiNode):
            continue
        if item.kind == NodeKind.BOLD:
            seen_bold = True
            continue

        is_form_link = (
            item.kind == NodeKind.LINK
            and seen_bold
            and clean_node(wxr, None, item) != ""
            and len(item.largs) > 0
        )
        if not is_form_link:
            continue

        word = clean_node(wxr, None, item.largs[0])
        if word == "":
            continue
        form = Form(form=word)
        if label != "":
            form.raw_tags.append(label)
            translate_raw_tags(form)
        word_entry.forms.append(form)

172 

173 

def extract_variasi_template(
    wxr: WiktextractContext, sense: Sense, t_node: TemplateNode
) -> None:
    """Record a "variasi…" template on ``sense`` as an alt-of relation.

    Each non-empty positional argument 1-3 becomes an ``AltForm`` in
    ``sense.alt_of``, the cleaned template text (when non-empty) is added
    as a gloss, and the "alt-of" tag is appended.
    """
    params = t_node.template_parameters
    for position in (1, 2, 3):
        variant = clean_node(wxr, None, params.get(position, ""))
        if variant == "":
            continue
        sense.alt_of.append(AltForm(word=variant))

    expanded = clean_node(wxr, sense, t_node)
    if expanded != "":
        sense.glosses.append(expanded)
    sense.tags.append("alt-of")

185 

186 

def find_form_of_link(
    wxr: WiktextractContext, sense: Sense, gloss_nodes: list[WikiNode | str]
) -> None:
    """Mark ``sense`` as a form-of entry using the last link in the gloss.

    Top-level link nodes and links directly inside italic nodes are
    considered; the text of the last one found wins.  Nothing is changed
    when no link yields non-empty text.
    """
    # pre-expanded "nomina *", "imbuhan *", "ulang *", "verba *" templates
    target = ""
    for item in gloss_nodes:
        if not isinstance(item, WikiNode):
            continue
        if item.kind == NodeKind.LINK:
            target = clean_node(wxr, None, item)
        elif item.kind == NodeKind.ITALIC:
            for inner_link in item.find_child(NodeKind.LINK):
                target = clean_node(wxr, None, inner_link)

    if target == "":
        return
    sense.form_of.append(AltForm(word=target))
    sense.tags.append("form-of")

203 

204 

def extract_usage_section(
    wxr: WiktextractContext, word_entry: WordEntry, section_node: LevelNode
) -> None:
    """Collect usage notes of a section into ``word_entry.notes``.

    Every list item (at any nesting depth) contributes one note built from
    its non-list children.  Scanning stops at the first child whose kind is
    in ``LEVEL_KIND_FLAGS``.  All remaining non-list children seen before
    that point are cleaned together and, if non-empty, added as one final
    note.
    """
    loose_nodes = []
    for child in section_node.children:
        if not isinstance(child, WikiNode):
            loose_nodes.append(child)
            continue
        if child.kind == NodeKind.LIST:
            for item in child.find_child_recursively(NodeKind.LIST_ITEM):
                item_nodes = list(item.invert_find_child(NodeKind.LIST))
                item_note = clean_node(wxr, word_entry, item_nodes)
                if item_note != "":
                    word_entry.notes.append(item_note)
        elif child.kind in LEVEL_KIND_FLAGS:
            break
        else:
            loose_nodes.append(child)

    trailing_note = clean_node(wxr, word_entry, loose_nodes)
    if trailing_note != "":
        word_entry.notes.append(trailing_note)

227 

228 

def extract_defdate_template(
    wxr: WiktextractContext, sense: Sense, t_node: TemplateNode
) -> None:
    """Store the date shown by a "defdate" template as an attestation.

    The template is expanded and re-parsed, its cleaned text is stripped of
    leading and trailing square brackets, and a non-empty result is added
    to ``sense.attestations`` as an ``Attestation``.
    """
    expanded_node = wxr.wtp.parse(
        wxr.wtp.node_to_wikitext(t_node), expand_all=True
    )
    date = clean_node(wxr, None, expanded_node).strip("[]")
    if date != "":
        sense.attestations.append(Attestation(date=date))