Coverage for src/wiktextract/extractor/id/pos.py: 81%

126 statements  

« prev     ^ index     » next       coverage.py v7.9.2, created at 2025-07-04 10:58 +0000

1from wikitextprocessor.parser import ( 

2 LEVEL_KIND_FLAGS, 

3 HTMLNode, 

4 LevelNode, 

5 NodeKind, 

6 TemplateNode, 

7 WikiNode, 

8) 

9 

10from ...page import clean_node 

11from ...wxr_context import WiktextractContext 

12from .example import extract_example_list_item 

13from .models import AltForm, Example, Form, Sense, WordEntry 

14from .section_titles import POS_DATA 

15from .tags import translate_raw_tags 

16 

17 

def extract_pos_section(
    wxr: WiktextractContext,
    page_data: list[WordEntry],
    base_data: WordEntry,
    level_node: LevelNode,
    pos_title: str,
) -> None:
    """Open a new word entry for a part-of-speech section and populate it.

    A deep copy of ``base_data`` is appended to ``page_data`` and labelled
    with ``pos_title`` and with the ``pos`` value (plus optional ``tags``)
    that ``POS_DATA`` stores for that title.  The direct children of
    ``level_node`` are then scanned: items of lists whose marker starts and
    ends with ``#`` are handed to ``extract_gloss_list_item`` and the listed
    "lihat" templates to ``extract_lihat_2_template``.  Finally the children
    located before the first such list that had at least one item are passed
    to ``process_pos_header_nodes``.

    Raises ``KeyError`` when ``pos_title`` is not a key of ``POS_DATA``; the
    new entry has already been appended to ``page_data`` at that point.
    """
    lihat_template_names = (
        "lihat 2",
        "lihat ulang",
        "lihat v",
        "lihat2 a",
        "lihat2 adv",
        "lihat v ber2",
        "lihat n",
        "lihat 2 an",
        "lihat v ter2",
        "lihat2 v",
    )

    entry = base_data.model_copy(deep=True)
    page_data.append(entry)
    entry.pos_title = pos_title
    pos_info = POS_DATA[pos_title]
    entry.pos = pos_info["pos"]
    entry.tags.extend(pos_info.get("tags", []))

    children = level_node.children
    # Stays at len(children) when no gloss list item is found, so every
    # child is then treated as header content.
    header_end = len(children)
    for position, child in enumerate(children):
        if isinstance(child, WikiNode) and child.kind == NodeKind.LIST:
            marker = child.sarg
            if not (marker.startswith("#") and marker.endswith("#")):
                continue
            for item in child.find_child(NodeKind.LIST_ITEM):
                extract_gloss_list_item(wxr, entry, item)
                header_end = min(header_end, position)
        elif (
            isinstance(child, TemplateNode)
            and child.template_name in lihat_template_names
        ):
            extract_lihat_2_template(wxr, entry, child)

    process_pos_header_nodes(wxr, entry, children[:header_end])

56 

57 

def extract_gloss_list_item(
    wxr: WiktextractContext,
    word_entry: WordEntry,
    list_item: WikiNode,
    parent_sense: Sense | None = None,
) -> None:
    """Build a sense from one gloss list item and recurse into its sub-lists.

    The sense starts as a deep copy of ``parent_sense`` when one is given,
    otherwise as an empty ``Sense``.  Children of ``list_item`` are handled
    in order:

    * templates whose name starts with "variasi" go to
      ``extract_variasi_template``;
    * any other template is expanded; text of the form "(...)" (optionally
      followed by "·") is split on "·" into raw tags, anything else joins
      the gloss text;
    * the first ``<br>`` tag is dropped and switches on example mode, in
      which italic nodes become examples instead of gloss text;
    * list nodes are skipped here; every remaining node joins the gloss.

    The sense is appended to ``word_entry.senses`` only when it ends up with
    at least one gloss.  Afterwards nested ``#…#`` lists are processed
    recursively with this sense as parent, and nested ``#…:`` / ``#…*``
    lists are passed to ``extract_example_list_item``.
    """
    if parent_sense is None:
        sense = Sense()
    else:
        sense = parent_sense.model_copy(deep=True)

    gloss_parts = []
    seen_br = False
    for child in list_item.children:
        if isinstance(child, TemplateNode):
            if child.template_name.startswith("variasi"):
                extract_variasi_template(wxr, sense, child)
                continue
            text = clean_node(wxr, sense, child)
            looks_like_tags = text.startswith("(") and text.strip().endswith(
                (")", ") ·")
            )
            if not looks_like_tags:
                gloss_parts.append(text)
                continue
            pieces = (piece.strip("() ") for piece in text.split("·"))
            sense.raw_tags.extend(piece for piece in pieces if piece != "")
            continue

        if isinstance(child, HTMLNode) and child.tag == "br" and not seen_br:
            # Only the first <br> is consumed; later ones fall through and
            # are kept as gloss content.
            seen_br = True
            continue

        is_wiki_node = isinstance(child, WikiNode)
        if is_wiki_node and child.kind == NodeKind.ITALIC and seen_br:
            example_text = clean_node(wxr, None, child)
            if example_text != "":
                sense.examples.append(Example(text=example_text))
            continue
        if is_wiki_node and child.kind == NodeKind.LIST:
            # Nested lists are handled after the sense has been stored.
            continue
        gloss_parts.append(child)

    gloss_text = clean_node(wxr, sense, gloss_parts)
    if gloss_text != "":
        sense.glosses.append(gloss_text)
    if gloss_text.startswith("bentuk "):
        find_form_of_link(wxr, sense, gloss_parts)

    if len(sense.glosses) > 0:
        translate_raw_tags(sense)
        word_entry.senses.append(sense)

    for sub_list in list_item.find_child(NodeKind.LIST):
        marker = sub_list.sarg
        if not marker.startswith("#"):
            continue
        if marker.endswith("#"):
            for sub_item in sub_list.find_child(NodeKind.LIST_ITEM):
                extract_gloss_list_item(wxr, word_entry, sub_item, sense)
        elif marker.endswith((":", "*")):
            for example_item in sub_list.find_child(NodeKind.LIST_ITEM):
                extract_example_list_item(
                    wxr, word_entry, sense, example_item
                )

121 

122 

def extract_lihat_2_template(
    wxr: WiktextractContext, word_entry: WordEntry, t_node: TemplateNode
) -> None:
    """Add the senses listed by an expanded "lihat" template.

    The template is converted back to wikitext, fully expanded and parsed.
    Every item of every top-level list in the result is cleaned to plain
    text; the text after the first "⇢" becomes the gloss of a new sense,
    which is appended to ``word_entry.senses``.  Items without "⇢", or with
    nothing after it, produce no sense.

    A raw tag is taken from the text in front of the "⇢" only: everything
    up to the first ")" there, with surrounding "(" and spaces removed.
    Looking only at that part keeps a ")" inside the gloss from turning the
    whole "headword ⇢ gloss" prefix into a bogus tag.  Raw tags are passed
    through ``translate_raw_tags``, as is done for the senses built from
    ordinary gloss list items.
    """
    # https://id.wiktionary.org/wiki/Templat:lihat_2
    expanded_template = wxr.wtp.parse(
        wxr.wtp.node_to_wikitext(t_node), expand_all=True
    )
    for list_node in expanded_template.find_child(NodeKind.LIST):
        for list_item in list_node.find_child(NodeKind.LIST_ITEM):
            sense = Sense()
            item_text = clean_node(wxr, sense, list_item.children)
            head, _, gloss = item_text.partition("⇢")
            gloss = gloss.strip()
            if gloss == "":
                # No arrow, or nothing after it: there is no gloss to store.
                continue
            sense.glosses.append(gloss)
            if ")" in head:
                raw_tag = head[: head.index(")")].strip("( ")
                if raw_tag != "":
                    sense.raw_tags.append(raw_tag)
                    translate_raw_tags(sense)
            word_entry.senses.append(sense)

144 

145 

def process_pos_header_nodes(
    wxr: WiktextractContext, word_entry: WordEntry, nodes: list[WikiNode | str]
) -> None:
    """Collect forms from the nodes that precede the gloss list.

    Plain strings whose stripped text ends with ":" set the current label
    (the string with surrounding "():;, " characters removed).  Link nodes
    that come after a bold node, clean to non-empty text and have at least
    one link argument contribute a ``Form`` whose text is the cleaned first
    link argument.  When a label is set it is added to the form as a raw tag
    and the form is passed to ``translate_raw_tags``.
    """
    current_label = ""
    seen_bold = False
    for item in nodes:
        if isinstance(item, str):
            if item.strip().endswith(":"):
                current_label = item.strip("():;, ")
            continue
        if not isinstance(item, WikiNode):
            continue
        if item.kind == NodeKind.BOLD:
            seen_bold = True
            continue
        if item.kind != NodeKind.LINK or not seen_bold:
            continue
        if clean_node(wxr, None, item) == "" or len(item.largs) == 0:
            continue

        form_text = clean_node(wxr, None, item.largs[0])
        if form_text == "":
            continue
        new_form = Form(form=form_text)
        if current_label != "":
            new_form.raw_tags.append(current_label)
            translate_raw_tags(new_form)
        word_entry.forms.append(new_form)

170 

171 

def extract_variasi_template(
    wxr: WiktextractContext, sense: Sense, t_node: TemplateNode
) -> None:
    """Record the alternative forms named by a "variasi*" template.

    Positional template parameters 1 to 3 are cleaned to text and every
    non-empty one is added to ``sense.alt_of``.  The cleaned text of the
    template itself is added to ``sense.glosses`` when it is not empty.

    The "alt-of" tag is added only when this template actually supplied an
    alternative form, and only once per sense, so a sense is never tagged
    "alt-of" by a template that names no target and a sense copied from an
    already tagged parent does not get the tag twice.
    """
    found_alt_form = False
    for index in range(1, 4):
        word = clean_node(wxr, None, t_node.template_parameters.get(index, ""))
        if word != "":
            sense.alt_of.append(AltForm(word=word))
            found_alt_form = True
    gloss = clean_node(wxr, sense, t_node)
    if gloss != "":
        sense.glosses.append(gloss)
    if found_alt_form and "alt-of" not in sense.tags:
        sense.tags.append("alt-of")

183 

184 

def find_form_of_link(
    wxr: WiktextractContext, sense: Sense, gloss_nodes: list[WikiNode | str]
) -> None:
    """Mark ``sense`` as a form of the word linked in its gloss nodes.

    Link nodes directly in ``gloss_nodes`` and link nodes that are direct
    children of italic nodes are considered; the cleaned text of the last
    one visited is the target.  When that text is not empty it is added to
    ``sense.form_of`` and the "form-of" tag is appended.
    """
    # pre-expanded "nomina *", "imbuhan *", "ulang *", "verba *" templates
    target = ""
    for item in gloss_nodes:
        if not isinstance(item, WikiNode):
            continue
        if item.kind == NodeKind.LINK:
            candidate_links = [item]
        elif item.kind == NodeKind.ITALIC:
            candidate_links = item.find_child(NodeKind.LINK)
        else:
            continue
        for link in candidate_links:
            target = clean_node(wxr, None, link)

    if target == "":
        return
    sense.form_of.append(AltForm(word=target))
    sense.tags.append("form-of")

201 

202 

def extract_usage_section(
    wxr: WiktextractContext, word_entry: WordEntry, section_node: LevelNode
) -> None:
    """Store the text of a usage section as notes on ``word_entry``.

    Every list item found at any depth inside a list child of
    ``section_node`` becomes its own note, built from the item's children
    that are not lists.  Scanning stops at the first child whose kind is in
    ``LEVEL_KIND_FLAGS``.  All other children seen before that point are
    cleaned together and, when the result is not empty, appended as one
    further note after the list notes.
    """
    loose_nodes = []
    for child in section_node.children:
        is_wiki_node = isinstance(child, WikiNode)
        if is_wiki_node and child.kind == NodeKind.LIST:
            for item in child.find_child_recursively(NodeKind.LIST_ITEM):
                item_nodes = list(item.invert_find_child(NodeKind.LIST))
                item_note = clean_node(wxr, word_entry, item_nodes)
                if item_note != "":
                    word_entry.notes.append(item_note)
            continue
        if is_wiki_node and child.kind in LEVEL_KIND_FLAGS:
            break
        loose_nodes.append(child)

    loose_note = clean_node(wxr, word_entry, loose_nodes)
    if loose_note != "":
        word_entry.notes.append(loose_note)