Coverage for src/wiktextract/extractor/th/translation.py: 57%

1from itertools import count

3from mediawiki_langcodes import name_to_code

4from wikitextprocessor.parser import (

5 LEVEL_KIND_FLAGS,

6 LevelNode,

7 NodeKind,

8 TemplateNode,

9 WikiNode,

10)

12from ...page import clean_node

13from ...wxr_context import WiktextractContext

14from .models import Translation, WordEntry

15from .section_titles import TRANSLATION_SECTIONS

16from .tags import translate_raw_tags

def extract_translation_section(
    wxr: WiktextractContext,
    word_entry: WordEntry,
    level_node: LevelNode,
    sense: str = "",
    from_trans_see: bool = False,
    source: str = "",
) -> None:
    """Collect translations from the direct children of a section node.

    Three kinds of children are handled:

    * ``trans-top`` templates set the sense used for the list items that
      follow, unless a non-empty ``sense`` was supplied by a ``trans-see``
      caller, in which case the template is ignored.
    * list nodes have each of their list items passed to
      ``extract_translation_list_item``.
    * ``trans-see`` templates are followed through
      ``extract_trans_see_template``, but only when this call did not itself
      originate from a ``trans-see`` template.

    Args:
        wxr: Extraction context.
        word_entry: Entry that receives the extracted translations.
        level_node: Section node whose children are scanned.
        sense: Initial sense text for the translations.
        from_trans_see: True when called while following a ``trans-see``
            template.
        source: Value stored in the ``source`` field of each translation.
    """
    for child in level_node.children:
        if isinstance(child, TemplateNode):
            template_name = child.template_name
            # A sense handed down by a ``trans-see`` caller wins over the
            # sense given in the target page's own ``trans-top``.
            keep_given_sense = from_trans_see and sense != ""
            if template_name == "trans-top" and not keep_given_sense:
                sense = clean_node(
                    wxr, None, child.template_parameters.get(1, "")
                )
                # Second pass with ``word_entry`` as the target argument.
                clean_node(wxr, word_entry, child)
            elif template_name == "trans-see" and not from_trans_see:
                extract_trans_see_template(wxr, word_entry, child)
        elif isinstance(child, WikiNode) and child.kind == NodeKind.LIST:
            for item in child.find_child(NodeKind.LIST_ITEM):
                extract_translation_list_item(
                    wxr, word_entry, item, sense, source
                )

def extract_translation_list_item(
    wxr: WiktextractContext,
    word_entry: WordEntry,
    list_item: WikiNode,
    sense: str,
    source: str,
) -> None:
    """Extract the translations found in one list item.

    The language name is everything in the list item up to the first ``:``
    found in a plain-text child. After that, children are handled by type:

    * ``t``, ``t+`` and ``t-simple`` templates go to ``extract_t_template``.
    * link nodes become a ``Translation`` directly, once a language name is
      known.
    * nested lists are processed recursively.
    * links inside italic nodes whose text ends with ``/คำแปลภาษาอื่น`` are
      followed via ``extract_translation_subpage``.

    Args:
        wxr: Extraction context.
        word_entry: Entry that receives the extracted translations.
        list_item: List item node to scan.
        sense: Sense text stored on each translation.
        source: Value stored in the ``source`` field of each translation.
    """
    t_template_names = ("t", "t+", "t-simple")
    lang_name = "unknown"
    lang_code = "unknown"

    for position, child in enumerate(list_item.children):
        if isinstance(child, str):
            # Only the first text chunk containing a colon names the language.
            if lang_name != "unknown" or ":" not in child:
                continue
            leading_text = clean_node(
                wxr, None, list_item.children[:position]
            )
            before_colon = child.partition(":")[0].strip()
            lang_name = (leading_text + before_colon) or "unknown"
            if lang_name != "unknown":
                lang_code = name_to_code(lang_name, "th")
                if lang_code == "":
                    lang_code = "unknown"
            continue

        if not isinstance(child, WikiNode):
            continue

        if (
            isinstance(child, TemplateNode)
            and child.template_name in t_template_names
        ):
            extract_t_template(
                wxr, word_entry, child, lang_name, sense, source
            )
        elif child.kind == NodeKind.LINK:
            # Plain links only count once the language is known.
            if lang_name != "unknown":
                word = clean_node(wxr, None, child)
                if word != "":
                    word_entry.translations.append(
                        Translation(
                            word=word,
                            lang=lang_name,
                            lang_code=lang_code,
                            sense=sense,
                            source=source,
                        )
                    )
        elif child.kind == NodeKind.LIST:
            for nested_item in child.find_child(NodeKind.LIST_ITEM):
                extract_translation_list_item(
                    wxr, word_entry, nested_item, sense, source
                )
        elif child.kind == NodeKind.ITALIC:
            for link_node in child.find_child(NodeKind.LINK):
                link_text = clean_node(wxr, None, link_node)
                if link_text.endswith("/คำแปลภาษาอื่น"):
                    extract_translation_subpage(wxr, word_entry, link_text)

101

102

def extract_t_template(
    wxr: WiktextractContext,
    word_entry: WordEntry,
    t_node: TemplateNode,
    lang_name: str,
    sense: str,
    source: str,
) -> None:
    """Build a ``Translation`` from a ``t``-style template and store it.

    The template is expanded and the resulting HTML is inspected:

    * the first ``<span>`` whose ``lang`` attribute equals the template's
      language code (first positional argument) supplies the word;
    * any other ``<span>`` whose ``class`` contains ``Latn`` supplies the
      romanization (a later match overwrites an earlier one);
    * each ``<abbr>`` element contributes one raw tag.

    The ``lit`` template argument is copied to the ``lit`` field. Nothing is
    appended to ``word_entry`` when no word was found.

    Args:
        wxr: Extraction context.
        word_entry: Entry that receives the translation.
        t_node: The translation template node.
        lang_name: Language name stored on the translation.
        sense: Sense text stored on the translation.
        source: Value stored in the ``source`` field of the translation.
    """
    params = t_node.template_parameters
    lang_code = clean_node(wxr, None, params.get(1, "")) or "unknown"
    translation = Translation(
        word="", lang=lang_name, lang_code=lang_code, sense=sense, source=source
    )

    wikitext = wxr.wtp.node_to_wikitext(t_node)
    expanded = wxr.wtp.parse(wikitext, expand_all=True)

    for span in expanded.find_html_recursively("span"):
        if translation.word == "" and span.attrs.get("lang") == lang_code:
            translation.word = clean_node(wxr, None, span)
            continue
        if "Latn" in span.attrs.get("class", ""):
            translation.roman = clean_node(wxr, None, span)

    translation.lit = clean_node(wxr, None, params.get("lit", ""))
    translation.raw_tags.extend(
        clean_node(wxr, None, abbr)
        for abbr in expanded.find_html_recursively("abbr")
    )

    if translation.word == "":
        return

    translate_raw_tags(translation)
    word_entry.translations.append(translation)
    # Run the expansion's top-level links through clean_node with
    # ``word_entry`` as the target argument.
    for link_node in expanded.find_child(NodeKind.LINK):
        clean_node(wxr, word_entry, link_node)

139

140

def extract_translation_subpage(
    wxr: WiktextractContext, word_entry: WordEntry, page_title: str
) -> None:
    """Extract translations from the translation section of another page.

    The page is looked up with ``wxr.wtp.get_page(page_title, 0)``. If it
    does not exist, has no body, or contains no section whose title is in
    ``TRANSLATION_SECTIONS``, nothing happens. Extracted translations carry
    ``page_title`` as their ``source``.

    Args:
        wxr: Extraction context.
        word_entry: Entry that receives the extracted translations.
        page_title: Title of the page to load.
    """
    page = wxr.wtp.get_page(page_title, 0)
    body = None if page is None else page.body
    if body is None:
        return

    parsed_page = wxr.wtp.parse(body)
    section = find_subpage_section(wxr, parsed_page, TRANSLATION_SECTIONS)
    if section is None:
        return

    extract_translation_section(wxr, word_entry, section, source=page_title)

153

154

def extract_trans_see_template(
    wxr: WiktextractContext, word_entry: WordEntry, t_node: TemplateNode
) -> None:
    """Follow a ``trans-see`` template and extract the referenced translations.

    The first positional argument is the sense. Positional arguments from 2
    onward (consecutive, stopping at the first missing index) are the titles
    of the pages to read; when argument 2 is absent the sense text itself is
    used as the only page title. Any ``#fragment`` is stripped from a title
    before the page is looked up.

    For each page that exists, has a body and contains a section whose title
    is in ``TRANSLATION_SECTIONS``, that section is processed with the sense
    from this template and the page title as ``source``.

    Args:
        wxr: Extraction context.
        word_entry: Entry that receives the extracted translations.
        t_node: The ``trans-see`` template node.
    """
    params = t_node.template_parameters
    sense = clean_node(wxr, None, params.get(1, ""))

    page_titles = []
    for index in count(2):
        if index not in params:
            break
        page_titles.append(clean_node(wxr, None, params[index]))
    if not page_titles:
        # No explicit target page: the sense text doubles as the page title.
        page_titles.append(sense)

    for page_title in page_titles:
        page_title = page_title.partition("#")[0]
        page = wxr.wtp.get_page(page_title)
        # Skip this title only; a missing or empty page must not prevent the
        # remaining titles from being processed.
        if page is None or page.body is None:
            continue
        root = wxr.wtp.parse(page.body)
        target_node = find_subpage_section(wxr, root, TRANSLATION_SECTIONS)
        if target_node is not None:
            extract_translation_section(
                wxr,
                word_entry,
                target_node,
                sense=sense,
                from_trans_see=True,
                source=page_title,
            )

186

187

def find_subpage_section(
    wxr: WiktextractContext, root: WikiNode, target_sections: tuple[str, ...]
) -> WikiNode | None:
    """Return the first section under ``root`` whose title is wanted.

    Section nodes are visited in the order produced by
    ``root.find_child_recursively(LEVEL_KIND_FLAGS)``; a section's title is
    the cleaned text of its ``largs``.

    Args:
        wxr: Extraction context.
        root: Node whose descendants are searched.
        target_sections: Accepted section titles.

    Returns:
        The first matching section node, or ``None`` when no title matches.
    """
    matching_sections = (
        section
        for section in root.find_child_recursively(LEVEL_KIND_FLAGS)
        if clean_node(wxr, None, section.largs) in target_sections
    )
    return next(matching_sections, None)

Coverage for src/wiktextract/extractor/th/translation.py: 57%

97 statements