Coverage for src/wiktextract/extractor/ku/translation.py: 54%

104 statements  

« prev     ^ index     » next       coverage.py v7.10.6, created at 2025-09-12 08:27 +0000

1import re 

2 

3from mediawiki_langcodes import name_to_code 

4from wikitextprocessor.parser import ( 

5 LEVEL_KIND_FLAGS, 

6 LevelNode, 

7 NodeKind, 

8 TemplateNode, 

9 WikiNode, 

10) 

11 

12from ...page import clean_node 

13from ...wxr_context import WiktextractContext 

14from .models import Translation, WordEntry 

15 

16 

def is_translation_page(title: str) -> bool:
    """Tell whether *title* names a dedicated translation subpage.

    A title qualifies when it ends in ``/Werger``, optionally followed by
    one or more digits (e.g. ``av/Werger`` or ``av/Werger2``).
    """
    match = re.search(r"/Werger(?:\d+)?$", title)
    return match is not None

19 

20 

def extract_translation_section(
    wxr: WiktextractContext,
    word_entry: WordEntry,
    level_node: LevelNode,
    source: str = "",
    tags: list[str] | None = None,
    sense: str = "",
    from_trans_see: bool = False,
) -> None:
    """Collect translations from a translation section into *word_entry*.

    Walks the list, template, italic and bold children of *level_node*:

    * ``werger-ser`` templates set the current gloss (argument 1) and an
      optional numeric sense index (argument 2);
    * lists are handed item by item to ``extract_translation_list_item``;
    * links inside italic/bold nodes that point at a translation subpage
      (see ``is_translation_page``) are followed with
      ``extract_translation_page``;
    * ``werger-bnr`` / ``bnr-werger`` templates are followed with
      ``extract_trans_see_template``.

    Args:
        wxr: Extraction context.
        word_entry: Entry whose ``translations`` list receives the results.
        level_node: Section node to scan.
        source: Value forwarded as the ``source`` of each translation.
        tags: Tags forwarded to every translation; ``None`` means no tags.
        sense: Initial gloss for translations found before any
            ``werger-ser`` header.
        from_trans_see: True when this section was reached through a
            "see translations" template.  In that mode a non-empty *sense*
            is not overridden by ``werger-ser`` headers, and nested "see"
            templates are not followed.
    """
    # Avoid a shared mutable default; normalise once for all callees.
    tags = [] if tags is None else tags
    sense_index = 0
    for node in level_node.find_child(
        NodeKind.LIST | NodeKind.TEMPLATE | NodeKind.ITALIC | NodeKind.BOLD
    ):
        if (
            isinstance(node, TemplateNode)
            and node.template_name == "werger-ser"
            and not (sense != "" and from_trans_see)
        ):
            sense = clean_node(wxr, None, node.template_parameters.get(1, ""))
            sense_i_str = clean_node(
                wxr, None, node.template_parameters.get(2, "")
            )
            # Reset the index for every header so that a header without a
            # numeric index does not inherit the previous header's value.
            if re.fullmatch(r"\d+", sense_i_str):
                sense_index = int(sense_i_str)
            else:
                sense_index = 0
        elif node.kind == NodeKind.LIST:
            for list_item in node.find_child(NodeKind.LIST_ITEM):
                extract_translation_list_item(
                    wxr,
                    word_entry,
                    list_item,
                    sense,
                    sense_index,
                    source,
                    tags=tags,
                )
        elif node.kind in (NodeKind.ITALIC | NodeKind.BOLD):
            for link_node in node.find_child(NodeKind.LINK):
                link_str = clean_node(wxr, None, link_node)
                if is_translation_page(link_str):
                    extract_translation_page(wxr, word_entry, link_str)
        elif (
            isinstance(node, TemplateNode)
            and node.template_name in ("werger-bnr", "bnr-werger")
            and not from_trans_see
        ):
            extract_trans_see_template(wxr, word_entry, node)

67 

68 

def extract_translation_list_item(
    wxr: WiktextractContext,
    word_entry: WordEntry,
    list_item: WikiNode,
    sense: str,
    sense_index: int,
    source: str,
    tags: list[str] | None = None,
) -> None:
    """Extract the translations found in one list item.

    The text before the first ``:`` is taken as the language name.  A ``Z``
    template supplies the language code, ``W`` / ``W+`` / ``W-`` templates
    are delegated to ``extract_w_template``, nested lists are processed
    recursively, and plain links appearing after the colon are stored as
    translations directly.

    Args:
        wxr: Extraction context.
        word_entry: Entry whose ``translations`` list receives the results.
        list_item: The list item node to scan.
        sense: Gloss attached to every translation found.
        sense_index: Sense number attached to every translation found.
        source: Value stored as the ``source`` of each translation.
        tags: Tags attached to every translation; ``None`` means no tags.
    """
    # Avoid a shared mutable default.
    tags = [] if tags is None else tags
    lang_name = "unknown"
    lang_code = "unknown"
    before_colon = True
    for index, node in enumerate(list_item.children):
        if isinstance(node, str) and ":" in node and lang_name == "unknown":
            # Everything up to the first colon is the language name.
            lang_name = clean_node(
                wxr,
                None,
                list_item.children[:index] + [node[: node.index(":")]],
            )
            if lang_name == "":
                lang_name = "unknown"
            before_colon = False
        elif isinstance(node, TemplateNode) and node.template_name == "Z":
            lang_code = clean_node(
                wxr, None, node.template_parameters.get(1, "")
            )
        elif isinstance(node, TemplateNode) and node.template_name in (
            "W",
            "W+",
            "W-",
        ):
            # Forward the section tags as well (they used to be dropped
            # here); pass a copy because the callee appends to its list.
            extract_w_template(
                wxr,
                word_entry,
                node,
                sense,
                sense_index,
                lang_name,
                source,
                tags=list(tags),
            )
        elif isinstance(node, WikiNode) and node.kind == NodeKind.LIST:
            for child_list_item in node.find_child(NodeKind.LIST_ITEM):
                extract_translation_list_item(
                    wxr,
                    word_entry,
                    child_list_item,
                    sense,
                    sense_index,
                    source,
                    tags=tags,
                )
        elif (
            isinstance(node, WikiNode)
            and node.kind == NodeKind.LINK
            and not before_colon
        ):
            if lang_code in ["", "unknown"]:
                # No usable code from a "Z" template: derive it from the
                # language name instead.
                new_code = name_to_code(lang_name, "ku")
                if new_code != "":
                    lang_code = new_code
            tr_data = Translation(
                word=clean_node(wxr, None, node),
                lang=lang_name,
                lang_code=lang_code,
                sense=sense,
                sense_index=sense_index,
                source=source,
                tags=list(tags),
            )
            if tr_data.word != "":
                word_entry.translations.append(tr_data)

134 

135 

def extract_w_template(
    wxr: WiktextractContext,
    word_entry: WordEntry,
    t_node: TemplateNode,
    sense: str,
    sense_index: int,
    lang_name: str,
    source: str,
    tags: list[str] | None = None,
) -> None:
    """Build a translation from a ``W`` / ``W+`` / ``W-`` template.

    Argument 1 is the language code, and the ``cuda`` argument (falling back
    to argument 2) is the translated word.  Arguments 3 and 4 may carry
    gender/number abbreviations that are mapped to tags.  The template is
    also expanded so that a ``span`` whose class contains ``Latn`` can be
    used as the romanization when it differs from the word.

    Args:
        wxr: Extraction context.
        word_entry: Entry whose ``translations`` list receives the result.
        t_node: The template node to read.
        sense: Gloss attached to the translation.
        sense_index: Sense number attached to the translation.
        lang_name: Language name attached to the translation.
        source: Value stored as the ``source`` of the translation.
        tags: Initial tags; ``None`` means no tags.  The list is copied and
            never modified.
    """
    # https://ku.wiktionary.org/wiki/Şablon:W
    tr_data = Translation(
        lang=lang_name,
        lang_code=clean_node(
            wxr, None, t_node.template_parameters.get(1, "unknown")
        ),
        word=clean_node(
            wxr,
            None,
            t_node.template_parameters.get(
                "cuda", t_node.template_parameters.get(2, "")
            ),
        ),
        source=source,
        # Copy: tags are appended below and must not leak into the caller's
        # list (or into a shared default).
        tags=[] if tags is None else list(tags),
        sense=sense,
        sense_index=sense_index,
    )
    tag_args = {
        "n": "masculine",
        "m": "feminine",
        "f": "feminine",
        "nt": "gender-neutral",
        "mn": ["feminine", "masculine"],
        "g": "common-gender",
        "p": "plural",
        "y": "singular",
    }
    for tag_arg in [3, 4]:
        tag_arg_value = clean_node(
            wxr, None, t_node.template_parameters.get(tag_arg, "")
        )
        if tag_arg_value in tag_args:
            tag = tag_args[tag_arg_value]
            if isinstance(tag, str):
                tr_data.tags.append(tag)
            elif isinstance(tag, list):
                tr_data.tags.extend(tag)
    expanded_node = wxr.wtp.parse(
        wxr.wtp.node_to_wikitext(t_node), expand_all=True
    )
    for span_tag in expanded_node.find_html("span"):
        if "Latn" in span_tag.attrs.get("class", ""):
            roman = clean_node(wxr, None, span_tag)
            # Only keep a romanization that adds information.
            if roman not in ["", tr_data.word]:
                tr_data.roman = roman
                break
    if tr_data.word != "":
        word_entry.translations.append(tr_data)

195 

196 

def extract_translation_page(
    wxr: WiktextractContext, word_entry: WordEntry, page_title: str
) -> None:
    """Pull translations from the "Werger" section of another page.

    The page is looked up with ``wxr.wtp.get_page(page_title, 0)``
    (presumably 0 is the main namespace id; confirm against the
    wikitextprocessor API).  Nothing happens when the page or its body is
    missing, or when it has no section titled "Werger".  Extracted
    translations record *page_title* as their source.
    """
    page = wxr.wtp.get_page(page_title, 0)
    if page is not None and page.body is not None:
        parsed_root = wxr.wtp.parse(page.body)
        werger_section = find_subpage_section(wxr, parsed_root, "Werger")
        if werger_section is not None:
            extract_translation_section(
                wxr, word_entry, werger_section, source=page_title
            )

209 

210 

def extract_trans_see_template(
    wxr: WiktextractContext, word_entry: WordEntry, t_node: TemplateNode
) -> None:
    """Follow a "see translations" template to the pages it points at.

    Argument 1 is used as the gloss.  When argument 2 is present, arguments
    2 through 10 (stopping at the first missing one) are the page titles to
    visit; otherwise argument 1 doubles as the single page title.  For each
    page, the "Werger" section is extracted with ``from_trans_see=True``.

    Pages that do not exist or have no body are skipped, and the remaining
    titles are still processed.
    """
    # https://ku.wiktionary.org/wiki/Şablon:werger-bnr
    sense = clean_node(wxr, None, t_node.template_parameters.get(1, ""))
    page_titles = []
    if 2 in t_node.template_parameters:
        for index in range(2, 11):
            if index not in t_node.template_parameters:
                break
            page_titles.append(
                clean_node(wxr, None, t_node.template_parameters[index])
            )
    else:
        page_titles.append(
            clean_node(wxr, None, t_node.template_parameters.get(1, ""))
        )
    for page_title in page_titles:
        # Drop any "#section" fragment; only the page name is looked up.
        if "#" in page_title:
            page_title = page_title[: page_title.index("#")]
        page = wxr.wtp.get_page(page_title)
        if page is None or page.body is None:
            # Skip just this title; one missing page must not abort the
            # remaining ones.
            continue
        root = wxr.wtp.parse(page.body)
        target_node = find_subpage_section(wxr, root, "Werger")
        if target_node is not None:
            extract_translation_section(
                wxr,
                word_entry,
                target_node,
                source=page_title,
                sense=sense,
                from_trans_see=True,
            )

245 

246 

def find_subpage_section(
    wxr: WiktextractContext, root: WikiNode, target_section: str
) -> WikiNode | None:
    """Return the first heading node under *root* titled *target_section*.

    Heading nodes are searched recursively; a heading matches when the
    cleaned text of its ``largs`` equals *target_section* exactly.  Returns
    ``None`` when no heading matches.
    """
    headings = root.find_child_recursively(LEVEL_KIND_FLAGS)
    return next(
        (
            heading
            for heading in headings
            if clean_node(wxr, None, heading.largs) == target_section
        ),
        None,
    )