Coverage for src/wiktextract/extractor/ku/translation.py: 63%

87 statements  

« prev     ^ index     » next       coverage.py v7.9.2, created at 2025-07-04 10:58 +0000

1import re 

2 

3from mediawiki_langcodes import name_to_code 

4from wikitextprocessor.parser import ( 

5 LEVEL_KIND_FLAGS, 

6 LevelNode, 

7 NodeKind, 

8 TemplateNode, 

9 WikiNode, 

10) 

11 

12from ...page import clean_node 

13from ...wxr_context import WiktextractContext 

14from .models import Translation, WordEntry 

15 

16 

def is_translation_page(title: str) -> bool:
    """Tell whether *title* names a translation sub-page.

    A title qualifies when it ends with ``/Werger``, optionally followed
    by one or more digits (for example ``foo/Werger`` or ``foo/Werger2``).
    """
    found = re.search(r"/Werger(?:\d+)?$", title)
    return found is not None

19 

20 

def extract_translation_section(
    wxr: WiktextractContext,
    word_entry: WordEntry,
    level_node: LevelNode,
    source: str = "",
    tags: list[str] = [],
) -> None:
    """Collect the translations found under a translation section.

    Only the direct list, template, italic and bold children of
    ``level_node`` are inspected:

    * a ``werger-ser`` template starts a new sense: its first argument is
      the gloss and its second argument, when made only of digits, is the
      sense index; both apply to the lists that follow it;
    * every item of a list is handed to ``extract_translation_list_item``;
    * links inside italic/bold text, and the first argument of a
      ``werger-bnr`` template, are followed with
      ``extract_translation_page`` when they name a translation sub-page.

    Args:
        wxr: Extraction context, passed through to ``clean_node`` and the
            helper extractors.
        word_entry: Entry whose ``translations`` list is filled by the
            helpers.
        level_node: Section node to scan.
        source: Passed unchanged to every extracted list item.
        tags: Passed unchanged to every extracted list item.  The default
            list is shared between calls, so it is never mutated here.
    """
    sense = ""
    sense_index = 0
    for node in level_node.find_child(
        NodeKind.LIST | NodeKind.TEMPLATE | NodeKind.ITALIC | NodeKind.BOLD
    ):
        if isinstance(node, TemplateNode):
            if node.template_name == "werger-ser":
                sense = clean_node(
                    wxr, None, node.template_parameters.get(1, "")
                )
                sense_i_str = clean_node(
                    wxr, None, node.template_parameters.get(2, "")
                )
                # Reset the index when this sense has no numeric one, so
                # the index of the previous sense is not carried over to
                # a different gloss.
                if re.fullmatch(r"\d+", sense_i_str):
                    sense_index = int(sense_i_str)
                else:
                    sense_index = 0
            elif node.template_name == "werger-bnr":
                page_title = clean_node(
                    wxr, None, node.template_parameters.get(1, "")
                )
                _follow_translation_page(wxr, word_entry, page_title, source)
        elif node.kind == NodeKind.LIST:
            for list_item in node.find_child(NodeKind.LIST_ITEM):
                extract_translation_list_item(
                    wxr,
                    word_entry,
                    list_item,
                    sense,
                    sense_index,
                    source,
                    tags=tags,
                )
        elif node.kind in (NodeKind.ITALIC | NodeKind.BOLD):
            for link_node in node.find_child(NodeKind.LINK):
                link_str = clean_node(wxr, None, link_node)
                _follow_translation_page(wxr, word_entry, link_str, source)


def _follow_translation_page(
    wxr: WiktextractContext,
    word_entry: WordEntry,
    page_title: str,
    source: str,
) -> None:
    """Extract ``page_title`` if it is a translation sub-page.

    ``source`` is the title of the page currently being processed when
    this section was itself reached through ``extract_translation_page``.
    A page that refers to itself is skipped because following it would
    recurse without end.  Only direct self-references are detected, not
    longer cycles between several sub-pages.
    """
    if is_translation_page(page_title) and page_title != source:
        extract_translation_page(wxr, word_entry, page_title)

68 

69 

def extract_translation_list_item(
    wxr: WiktextractContext,
    word_entry: WordEntry,
    list_item: WikiNode,
    sense: str,
    sense_index: int,
    source: str,
    tags: list[str] = [],
) -> None:
    """Extract the translations written in one list item.

    The item is read left to right.  Everything before the first ``:``
    found in a plain-text child is cleaned and used as the language name.
    A ``Z`` template supplies the language code from its first argument.
    ``W``, ``W+`` and ``W-`` templates are delegated to
    ``extract_w_template``.  Plain links that come after the colon become
    translations directly, and nested lists are processed recursively
    with the same sense data.

    Args:
        wxr: Extraction context.
        word_entry: Entry whose ``translations`` list receives the data.
        list_item: List item node to read.
        sense: Gloss stored on every translation of this item.
        sense_index: Sense index stored on every translation of this item.
        source: Stored on every translation of this item.
        tags: Tags applied to every translation of this item.  The list is
            copied before it is handed on, so neither the caller's list
            nor the shared default can be modified downstream.
    """
    lang_name = "unknown"
    lang_code = "unknown"
    before_colon = True
    for index, node in enumerate(list_item.children):
        if isinstance(node, str):
            # Only the first colon separates the language name from the
            # translations.  Guarding on ``before_colon`` (rather than on
            # the name still being "unknown") keeps a later colon from
            # being read as a language name when the real one was empty.
            if before_colon and ":" in node:
                lang_name = (
                    clean_node(
                        wxr,
                        None,
                        list_item.children[:index]
                        + [node[: node.index(":")]],
                    )
                    or "unknown"
                )
                before_colon = False
        elif isinstance(node, TemplateNode):
            if node.template_name == "Z":
                lang_code = clean_node(
                    wxr, None, node.template_parameters.get(1, "")
                )
            elif node.template_name in ("W", "W+", "W-"):
                # Forward the tags here as well, so template-based
                # translations get the same tags as link-based ones.
                extract_w_template(
                    wxr,
                    word_entry,
                    node,
                    sense,
                    sense_index,
                    lang_name,
                    source,
                    tags=list(tags),
                )
        elif isinstance(node, WikiNode) and node.kind == NodeKind.LIST:
            for child_list_item in node.find_child(NodeKind.LIST_ITEM):
                extract_translation_list_item(
                    wxr,
                    word_entry,
                    child_list_item,
                    sense,
                    sense_index,
                    source,
                    tags=tags,
                )
        elif (
            isinstance(node, WikiNode)
            and node.kind == NodeKind.LINK
            and not before_colon
        ):
            if lang_code in ("", "unknown"):
                # No usable code yet: try to derive it from the name.
                new_code = name_to_code(lang_name, "ku")
                if new_code != "":
                    lang_code = new_code
            tr_data = Translation(
                word=clean_node(wxr, None, node),
                lang=lang_name,
                lang_code=lang_code,
                sense=sense,
                sense_index=sense_index,
                source=source,
                tags=list(tags),
            )
            if tr_data.word != "":
                word_entry.translations.append(tr_data)

135 

136 

def extract_w_template(
    wxr: WiktextractContext,
    word_entry: WordEntry,
    t_node: TemplateNode,
    sense: str,
    sense_index: int,
    lang_name: str,
    source: str,
    tags: list[str] = [],
) -> None:
    """Extract one translation from a ``W``-family template.

    The language code is the first template argument and the word is the
    ``cuda`` argument when present, otherwise the second argument.
    Arguments 3 and 4 may hold grammatical abbreviations that are mapped
    to tags.  The template is then expanded, and the text of the first
    ``span`` whose class contains ``Latn`` is stored as romanization
    unless it is empty or equal to the word.  Nothing is appended when
    the word is empty.

    Args:
        wxr: Extraction context.
        word_entry: Entry whose ``translations`` list receives the data.
        t_node: The template node to read.
        sense: Gloss stored on the translation.
        sense_index: Sense index stored on the translation.
        lang_name: Language name stored on the translation; also used to
            look up a language code when the template does not give one.
        source: Stored on the translation.
        tags: Initial tags.  The list is copied, so tags added here never
            leak into the caller's list or the shared default.
    """
    # https://ku.wiktionary.org/wiki/Şablon:W
    params = t_node.template_parameters
    lang_code = clean_node(wxr, None, params.get(1, "unknown"))
    if lang_code in ("", "unknown"):
        # Same fallback as for link-based translations in list items.
        new_code = name_to_code(lang_name, "ku")
        if new_code != "":
            lang_code = new_code
    tr_data = Translation(
        word=clean_node(wxr, None, params.get("cuda", params.get(2, ""))),
        lang=lang_name,
        lang_code=lang_code,
        sense=sense,
        sense_index=sense_index,
        source=source,
        tags=list(tags),
    )
    if tr_data.word == "":
        # Nothing would be stored, so skip the template expansion below.
        return

    tag_args = {
        "n": ["masculine"],
        "m": ["feminine"],
        "f": ["feminine"],
        "nt": ["gender-neutral"],
        "mn": ["feminine", "masculine"],
        "g": ["common-gender"],
        "p": ["plural"],
        "y": ["singular"],
    }
    for tag_arg in (3, 4):
        tag_arg_value = clean_node(wxr, None, params.get(tag_arg, ""))
        tr_data.tags.extend(tag_args.get(tag_arg_value, []))

    expanded_node = wxr.wtp.parse(
        wxr.wtp.node_to_wikitext(t_node), expand_all=True
    )
    for span_tag in expanded_node.find_html("span"):
        if "Latn" in span_tag.attrs.get("class", ""):
            roman = clean_node(wxr, None, span_tag)
            if roman not in ("", tr_data.word):
                tr_data.roman = roman
            # Only the first Latin-script span is considered.
            break
    word_entry.translations.append(tr_data)

194 

195 

def extract_translation_page(
    wxr: WiktextractContext, word_entry: WordEntry, page_title: str
) -> None:
    """Pull translations for ``word_entry`` from a separate page.

    The page ``page_title`` is loaded and parsed.  Within every level-2
    section whose heading equals ``word_entry.lang``, each nested section
    headed ``Werger`` is passed to ``extract_translation_section`` with
    ``page_title`` as its source.  The function returns silently when the
    page or its body is missing.
    """
    # NOTE(review): the second argument is presumably a namespace id - confirm.
    page = wxr.wtp.get_page(page_title, 0)
    if page is None or page.body is None:
        return

    tree = wxr.wtp.parse(page.body)
    # Lazily keep only the language sections that match this entry.
    own_language_sections = (
        section
        for section in tree.find_child(NodeKind.LEVEL2)
        if clean_node(wxr, None, section.largs) == word_entry.lang
    )
    for language_section in own_language_sections:
        for heading in language_section.find_child_recursively(
            LEVEL_KIND_FLAGS
        ):
            heading_text = clean_node(wxr, None, heading.largs)
            if heading_text != "Werger":
                continue
            extract_translation_section(
                wxr, word_entry, heading, page_title
            )

212 )