Coverage for src/wiktextract/extractor/th/translation.py: 68%

76 statements  

« prev     ^ index     » next       coverage.py v7.10.3, created at 2025-08-15 05:18 +0000

1from mediawiki_langcodes import name_to_code 

2from wikitextprocessor import LevelNode, NodeKind, TemplateNode, WikiNode 

3 

4from ...page import clean_node 

5from ...wxr_context import WiktextractContext 

6from .models import Translation, WordEntry 

7from .tags import translate_raw_tags 

8 

9 

def extract_translation_section(
    wxr: WiktextractContext, word_entry: WordEntry, level_node: LevelNode
) -> None:
    """Collect translations from the children of a translation section.

    Walks the direct children of ``level_node`` in order.  A ``trans-top``
    template sets the sense text (taken from its first positional
    parameter) that is applied to every list found after it, until the
    next ``trans-top``.  Every list item of every list child is handed to
    ``extract_translation_list_item`` together with the current sense.

    Results are appended to ``word_entry``; nothing is returned.
    """
    current_sense = ""
    for child in level_node.children:
        is_trans_top = (
            isinstance(child, TemplateNode)
            and child.template_name == "trans-top"
        )
        if is_trans_top:
            current_sense = clean_node(
                wxr, None, child.template_parameters.get(1, "")
            )
            # Second pass with `word_entry` as the target; the returned
            # text is discarded (presumably run for its side effects on
            # the entry, e.g. categories - confirm against clean_node).
            clean_node(wxr, word_entry, child)
            continue

        if not isinstance(child, WikiNode) or child.kind != NodeKind.LIST:
            continue

        for item in child.find_child(NodeKind.LIST_ITEM):
            extract_translation_list_item(
                wxr, word_entry, item, current_sense
            )

21 

22 

def extract_translation_list_item(
    wxr: WiktextractContext,
    word_entry: WordEntry,
    list_item: WikiNode,
    sense: str,
) -> None:
    """Extract the translations held in a single list item.

    The children of ``list_item`` are processed in order:

    * The first string child containing ``":"`` supplies the language
      name: the cleaned text of everything before that child plus the
      part of the string in front of the colon.  The name is looked up
      with ``name_to_code(..., "th")``; an empty name or empty code is
      stored as ``"unknown"``.
    * ``t`` / ``t+`` / ``t-simple`` templates are passed to
      ``extract_t_template``.
    * A plain link, once a language name is known, becomes a
      ``Translation`` whose word is the cleaned link text.
    * Nested lists are processed recursively, item by item, with the same
      ``sense`` (each nested item starts with an unknown language again).
    * Links inside italic text whose cleaned text ends with
      ``"/คำแปลภาษาอื่น"`` are followed via ``extract_translation_page``.

    Results are appended to ``word_entry.translations``.
    """
    t_template_names = ("t", "t+", "t-simple")
    lang_name = "unknown"
    lang_code = "unknown"

    for position, child in enumerate(list_item.children):
        # Language label: only the first matching string is used.
        if isinstance(child, str) and ":" in child and lang_name == "unknown":
            leading = clean_node(wxr, None, list_item.children[:position])
            before_colon = child.partition(":")[0].strip()
            lang_name = (leading + before_colon) or "unknown"
            if lang_name != "unknown":
                found_code = name_to_code(lang_name, "th")
                lang_code = "unknown" if found_code == "" else found_code
            continue

        if (
            isinstance(child, TemplateNode)
            and child.template_name in t_template_names
        ):
            extract_t_template(wxr, word_entry, child, lang_name, sense)
            continue

        if not isinstance(child, WikiNode):
            continue

        if child.kind == NodeKind.LINK:
            # Bare links are only trusted after a language label was seen.
            if lang_name != "unknown":
                word = clean_node(wxr, None, child)
                if word != "":
                    word_entry.translations.append(
                        Translation(
                            word=word,
                            lang=lang_name,
                            lang_code=lang_code,
                            sense=sense,
                        )
                    )
        elif child.kind == NodeKind.LIST:
            for nested_item in child.find_child(NodeKind.LIST_ITEM):
                extract_translation_list_item(
                    wxr, word_entry, nested_item, sense
                )
        elif child.kind == NodeKind.ITALIC:
            for italic_link in child.find_child(NodeKind.LINK):
                target = clean_node(wxr, None, italic_link)
                if target.endswith("/คำแปลภาษาอื่น"):
                    extract_translation_page(wxr, word_entry, target)

74 

75 

def extract_t_template(
    wxr: WiktextractContext,
    word_entry: WordEntry,
    t_node: TemplateNode,
    lang_name: str,
    sense: str,
) -> None:
    """Build a ``Translation`` from a ``t``-style translation template.

    The language code comes from the template's first positional
    parameter (``"unknown"`` when empty).  The template is expanded and
    the resulting HTML is inspected:

    * the first ``<span>`` whose ``lang`` attribute equals the language
      code provides the translated word;
    * any other ``<span>`` whose ``class`` contains ``"Latn"`` provides
      the romanization (a later match overwrites an earlier one);
    * the text of every ``<abbr>`` is collected as a raw tag.

    The ``lit`` template parameter is stored as the literal meaning.  The
    translation is appended to ``word_entry.translations`` only when a
    word was found; in that case the top-level links of the expanded
    output are also cleaned against ``word_entry``.
    """
    lang_code = clean_node(wxr, None, t_node.template_parameters.get(1, ""))
    if lang_code == "":
        lang_code = "unknown"

    tr_data = Translation(
        word="", lang=lang_name, lang_code=lang_code, sense=sense
    )
    template_wikitext = wxr.wtp.node_to_wikitext(t_node)
    expanded = wxr.wtp.parse(template_wikitext, expand_all=True)

    for span in expanded.find_html_recursively("span"):
        if tr_data.word == "" and span.attrs.get("lang") == lang_code:
            tr_data.word = clean_node(wxr, None, span)
            continue
        if "Latn" in span.attrs.get("class", ""):
            tr_data.roman = clean_node(wxr, None, span)

    tr_data.lit = clean_node(
        wxr, None, t_node.template_parameters.get("lit", "")
    )
    tr_data.raw_tags.extend(
        clean_node(wxr, None, abbr)
        for abbr in expanded.find_html_recursively("abbr")
    )

    if tr_data.word == "":
        # Nothing usable was produced by the template; record nothing.
        return

    translate_raw_tags(tr_data)
    word_entry.translations.append(tr_data)
    # Return values are discarded; presumably run so that link side
    # effects (e.g. categories) land on the entry - confirm in clean_node.
    for top_level_link in expanded.find_child(NodeKind.LINK):
        clean_node(wxr, word_entry, top_level_link)

111 

112 

def extract_translation_page(
    wxr: WiktextractContext,
    word_entry: WordEntry,
    page_title: str,
) -> None:
    """Pull translations for ``word_entry`` from another page.

    Loads ``page_title`` via ``wxr.wtp.get_page(page_title, 0)`` and
    parses its body.  Only the level-2 section whose heading text (with
    the ``"ภาษา"`` prefix removed) equals ``word_entry.lang`` is
    considered, and inside it only level-3 sections whose heading equals
    ``word_entry.pos_title``.  Every level-4 section found there is
    passed to ``extract_translation_section``.

    Does nothing when the page or its body is missing.
    """
    page = wxr.wtp.get_page(page_title, 0)
    if page is None:
        return
    body = page.body
    if body is None:
        return

    tree = wxr.wtp.parse(body)
    for lang_section in tree.find_child(NodeKind.LEVEL2):
        heading = clean_node(wxr, None, lang_section.largs)
        if heading.removeprefix("ภาษา") == word_entry.lang:
            for pos_section in lang_section.find_child(NodeKind.LEVEL3):
                pos_heading = clean_node(wxr, None, pos_section.largs)
                if pos_heading == word_entry.pos_title:
                    for tr_section in pos_section.find_child(
                        NodeKind.LEVEL4
                    ):
                        extract_translation_section(
                            wxr, word_entry, tr_section
                        )