Coverage for src/wiktextract/extractor/ku/translation.py: 63%

1import re

3from mediawiki_langcodes import name_to_code

4from wikitextprocessor.parser import (

5 LEVEL_KIND_FLAGS,

6 LevelNode,

7 NodeKind,

8 TemplateNode,

9 WikiNode,

10)

12from ...page import clean_node

13from ...wxr_context import WiktextractContext

14from .models import Translation, WordEntry

def is_translation_page(title: str) -> bool:
    """Tell whether ``title`` names a translation subpage.

    A title qualifies when it ends in ``/Werger``, optionally followed by
    one or more digits (for example ``/Werger2``).
    """
    found = re.search(r"/Werger(?:\d+)?$", title)
    return found is not None

def extract_translation_section(
    wxr: WiktextractContext,
    word_entry: WordEntry,
    level_node: LevelNode,
    source: str = "",
    tags: list[str] = [],
) -> None:
    """Collect the translations found under a translation heading.

    Walks the list, template, italic and bold children of ``level_node``:

    * ``werger-ser`` template: its first argument becomes the current sense
      text and its second argument, when purely numeric, the current sense
      index. Both apply to the lists that follow.
    * list: every list item is handed to ``extract_translation_list_item``.
    * italic / bold: each link whose text looks like a translation subpage
      title (see ``is_translation_page``) is followed with
      ``extract_translation_page``.
    * ``werger-bnr`` template: its first argument is followed the same way.

    Args:
        wxr: Extraction context, passed through to ``clean_node`` and the
            helper extractors.
        word_entry: Entry that receives the translations.
        level_node: Heading node of the translation section.
        source: Forwarded unchanged to every list item; callers in this
            file pass the title of the translation subpage being read.
        tags: Tags forwarded to every list item. The default list is shared
            between calls, but this function only forwards it and never
            mutates it.
    """
    sense = ""
    sense_index = 0
    for node in level_node.find_child(
        NodeKind.LIST | NodeKind.TEMPLATE | NodeKind.ITALIC | NodeKind.BOLD
    ):
        if (
            isinstance(node, TemplateNode)
            and node.template_name == "werger-ser"
        ):
            sense = clean_node(wxr, None, node.template_parameters.get(1, ""))
            sense_i_str = clean_node(
                wxr, None, node.template_parameters.get(2, "")
            )
            # A new header starts a new sense: fall back to 0 instead of
            # silently keeping the index of the previous header when this
            # one carries no numeric index.
            if re.fullmatch(r"\d+", sense_i_str):
                sense_index = int(sense_i_str)
            else:
                sense_index = 0
        elif node.kind == NodeKind.LIST:
            for list_item in node.find_child(NodeKind.LIST_ITEM):
                extract_translation_list_item(
                    wxr,
                    word_entry,
                    list_item,
                    sense,
                    sense_index,
                    source,
                    tags=tags,
                )
        elif node.kind in (NodeKind.ITALIC | NodeKind.BOLD):
            # Flag membership test: true for either italic or bold nodes.
            for link_node in node.find_child(NodeKind.LINK):
                link_str = clean_node(wxr, None, link_node)
                if is_translation_page(link_str):
                    extract_translation_page(wxr, word_entry, link_str)
        elif (
            isinstance(node, TemplateNode)
            and node.template_name == "werger-bnr"
        ):
            page_title = clean_node(
                wxr, None, node.template_parameters.get(1, "")
            )
            if is_translation_page(page_title):
                extract_translation_page(wxr, word_entry, page_title)

def extract_translation_list_item(
    wxr: WiktextractContext,
    word_entry: WordEntry,
    list_item: WikiNode,
    sense: str,
    sense_index: int,
    source: str,
    tags: list[str] = [],
) -> None:
    """Extract the translations contained in one list item.

    The children of ``list_item`` are scanned in order:

    * The first plain-text child containing ``":"`` ends the language name:
      everything before the colon is cleaned and used as the language name
      (``"unknown"`` when that text is empty).
    * A ``Z`` template supplies the language code through its first
      argument.
    * ``W``, ``W+`` and ``W-`` templates are delegated to
      ``extract_w_template``.
    * Nested lists are processed recursively with the same sense, source
      and tags.
    * A link appearing after the colon becomes a translation on its own.
      If no usable language code is known yet, it is looked up from the
      language name with ``name_to_code``.

    Args:
        wxr: Extraction context.
        word_entry: Entry whose ``translations`` list is extended.
        list_item: The list item node to scan.
        sense: Sense text attached to every translation.
        sense_index: Sense index attached to every translation.
        source: Source value attached to every translation.
        tags: Tags attached to every translation. The list is copied before
            it is handed on, so neither the caller's list nor the shared
            default can be mutated through the created objects.
    """
    lang_name = "unknown"
    lang_code = "unknown"
    before_colon = True
    for index, node in enumerate(list_item.children):
        if isinstance(node, str) and ":" in node and lang_name == "unknown":
            lang_name = clean_node(
                wxr,
                None,
                list_item.children[:index] + [node[: node.index(":")]],
            )
            if lang_name == "":
                lang_name = "unknown"
            before_colon = False
        elif isinstance(node, TemplateNode) and node.template_name == "Z":
            lang_code = clean_node(
                wxr, None, node.template_parameters.get(1, "")
            )
        elif isinstance(node, TemplateNode) and node.template_name in [
            "W",
            "W+",
            "W-",
        ]:
            # Forward the tags as the link and nested-list branches do;
            # pass a copy because the callee appends to its tag list.
            extract_w_template(
                wxr,
                word_entry,
                node,
                sense,
                sense_index,
                lang_name,
                source,
                tags=list(tags),
            )
        elif isinstance(node, WikiNode) and node.kind == NodeKind.LIST:
            for child_list_item in node.find_child(NodeKind.LIST_ITEM):
                extract_translation_list_item(
                    wxr,
                    word_entry,
                    child_list_item,
                    sense,
                    sense_index,
                    source,
                    tags=tags,
                )
        elif (
            isinstance(node, WikiNode)
            and node.kind == NodeKind.LINK
            and not before_colon
        ):
            # Links before the colon belong to the language name, not to
            # the translations, hence the ``before_colon`` check above.
            if lang_code in ["", "unknown"]:
                new_code = name_to_code(lang_name, "ku")
                if new_code != "":
                    lang_code = new_code
            tr_data = Translation(
                word=clean_node(wxr, None, node),
                lang=lang_name,
                lang_code=lang_code,
                sense=sense,
                sense_index=sense_index,
                source=source,
                tags=list(tags),
            )
            if tr_data.word != "":
                word_entry.translations.append(tr_data)

135

136

def extract_w_template(
    wxr: WiktextractContext,
    word_entry: WordEntry,
    t_node: TemplateNode,
    sense: str,
    sense_index: int,
    lang_name: str,
    source: str,
    tags: list[str] = [],
) -> None:
    """Build a translation from a ``W``-family template and store it.

    Template arguments used:

    * ``1``: language code (``"unknown"`` when absent).
    * ``cuda`` or, if that is absent, ``2``: the translated word.
    * ``3`` and ``4``: short gender/number codes, mapped to tags through
      the table below; unrecognised values are ignored.

    The template is also expanded, and the first ``span`` whose ``class``
    contains ``Latn`` and whose text is non-empty and differs from the word
    is stored as the romanization.

    Args:
        wxr: Extraction context.
        word_entry: Entry whose ``translations`` list is extended.
        t_node: The template node to read.
        sense: Sense text recorded on the translation.
        sense_index: Sense index recorded on the translation.
        lang_name: Language name recorded on the translation.
        source: Source value recorded on the translation.
        tags: Initial tags. The list is copied, so the tags appended here
            never leak into the caller's list or the shared default.

    The translation is appended only when its word is non-empty.
    """
    # https://ku.wiktionary.org/wiki/Şablon:W
    tr_data = Translation(
        lang=lang_name,
        lang_code=clean_node(
            wxr, None, t_node.template_parameters.get(1, "unknown")
        ),
        word=clean_node(
            wxr,
            None,
            t_node.template_parameters.get(
                "cuda", t_node.template_parameters.get(2, "")
            ),
        ),
        # Record the sense like the plain-link translations do; these two
        # parameters were previously accepted but dropped.
        sense=sense,
        sense_index=sense_index,
        source=source,
        tags=list(tags),
    )
    tag_args = {
        "n": "masculine",
        "m": "feminine",
        "f": "feminine",
        "nt": "gender-neutral",
        "mn": ["feminine", "masculine"],
        "g": "common-gender",
        "p": "plural",
        "y": "singular",
    }
    for tag_arg in [3, 4]:
        tag_arg_value = clean_node(
            wxr, None, t_node.template_parameters.get(tag_arg, "")
        )
        if tag_arg_value in tag_args:
            tag = tag_args[tag_arg_value]
            if isinstance(tag, str):
                tr_data.tags.append(tag)
            elif isinstance(tag, list):
                tr_data.tags.extend(tag)
    expanded_node = wxr.wtp.parse(
        wxr.wtp.node_to_wikitext(t_node), expand_all=True
    )
    for span_tag in expanded_node.find_html("span"):
        if "Latn" in span_tag.attrs.get("class", ""):
            roman = clean_node(wxr, None, span_tag)
            if roman not in ["", tr_data.word]:
                tr_data.roman = roman
                # Only the first usable romanization is kept.
                break
    if tr_data.word != "":
        word_entry.translations.append(tr_data)

194

195

def extract_translation_page(
    wxr: WiktextractContext, word_entry: WordEntry, page_title: str
) -> None:
    """Pull translations for ``word_entry`` from a separate page.

    Fetches ``page_title`` and parses its body. Within every level-2
    section whose heading equals ``word_entry.lang``, each nested heading
    titled ``Werger`` is processed with ``extract_translation_section``,
    passing ``page_title`` as the source. Nothing happens when the page or
    its body is missing.
    """
    # NOTE(review): the second argument (0) is presumably the namespace id
    # of the main namespace; confirm against the get_page API.
    page = wxr.wtp.get_page(page_title, 0)
    if page is None or page.body is None:
        return

    tree = wxr.wtp.parse(page.body)
    for lang_section in tree.find_child(NodeKind.LEVEL2):
        if clean_node(wxr, None, lang_section.largs) == word_entry.lang:
            for heading in lang_section.find_child_recursively(
                LEVEL_KIND_FLAGS
            ):
                if clean_node(wxr, None, heading.largs) != "Werger":
                    continue
                extract_translation_section(
                    wxr, word_entry, heading, page_title
                )