Coverage for src/wiktextract/extractor/ms/translation.py: 76%

70 statements  

« prev     ^ index     » next       coverage.py v7.9.2, created at 2025-07-04 10:58 +0000

1from wikitextprocessor import LevelNode, NodeKind, TemplateNode, WikiNode 

2 

3from ...page import clean_node 

4from ...wxr_context import WiktextractContext 

5from .models import Translation, WordEntry 

6from .tags import translate_raw_tags 

7 

8 

def extract_translation_section(
    wxr: WiktextractContext,
    page_data: list[WordEntry],
    base_data: WordEntry,
    level_node: LevelNode,
) -> None:
    """Gather the translations found under ``level_node`` and store them.

    The direct children of ``level_node`` are scanned in order: a
    ``ter-atas`` / ``teratas`` / ``trans-top`` template sets the sense text
    used for the translations that follow, and every list item of a list
    node is handed to ``extract_translation_list_item``.

    Translations with an empty ``word`` are dropped. The remaining ones,
    together with the categories collected while cleaning the header
    templates, are added to:

    * ``base_data`` when ``page_data`` is empty or its last entry has a
      different ``lang_code`` than ``base_data``;
    * every entry of ``page_data`` sharing the last entry's ``lang_code``
      when ``level_node`` is a level-3 node;
    * only the last entry of ``page_data`` otherwise.
    """
    header_templates = ("ter-atas", "teratas", "trans-top")
    current_sense = ""
    collected = []
    cat_store = {}
    for child in level_node.children:
        if (
            isinstance(child, TemplateNode)
            and child.template_name in header_templates
        ):
            current_sense = clean_node(wxr, cat_store, child)
        elif isinstance(child, WikiNode) and child.kind == NodeKind.LIST:
            for item in child.find_child(NodeKind.LIST_ITEM):
                collected.extend(
                    extract_translation_list_item(wxr, item, current_sense)
                )

    section_categories = cat_store.get("categories", [])
    # Translations without a word carry no usable data and are never stored.
    usable = [tr for tr in collected if tr.word != ""]

    def attach(entry: WordEntry) -> None:
        # Section-level categories go first, then each translation followed
        # by the categories gathered for that translation.
        entry.categories.extend(section_categories)
        for tr in usable:
            entry.translations.append(tr)
            entry.categories.extend(tr.categories)

    if not page_data or page_data[-1].lang_code != base_data.lang_code:
        attach(base_data)
    elif level_node.kind == NodeKind.LEVEL3:
        target_lang = page_data[-1].lang_code
        for entry in page_data:
            if entry.lang_code == target_lang:
                attach(entry)
    else:
        attach(page_data[-1])

51 

52 

def extract_translation_list_item(
    wxr: WiktextractContext, list_item: WikiNode, sense: str
) -> list[Translation]:
    """Extract all translations from one list item of a translation list.

    The children of ``list_item`` are processed in order:

    * the first plain string ending with ``:`` supplies the language name
      (colons and spaces stripped; ``"unknown"`` if nothing remains);
    * ``t`` / ``trad`` / ``tø`` / ``t-`` / ``t+`` templates each yield one
      ``Translation`` via ``extract_t_template``;
    * qualifier templates (``penerang``, ``qualifier``, ``i``, ``q``,
      ``qual``) add a raw tag to the most recent translation, if any;
    * nested lists are processed recursively with the same ``sense``.

    Args:
        wxr: Extraction context passed through to the helper functions.
        list_item: The list item node to scan.
        sense: Sense text assigned to every translation that is created.

    Returns:
        The translations found in this item and its nested lists, in
        document order. The list is empty when nothing matched.
    """
    tr_list = []
    lang_name = "unknown"
    for node in list_item.children:
        if (
            isinstance(node, str)
            and node.strip().endswith(":")
            and lang_name == "unknown"
        ):
            # Only the first "<language>:" string is used as the name.
            lang_name = node.strip(": ") or "unknown"
        elif isinstance(node, TemplateNode) and node.template_name in [
            "t",
            "trad",
            "tø",
            "t-",
            "t+",
        ]:
            tr_list.append(extract_t_template(wxr, node, sense, lang_name))
        elif (
            isinstance(node, TemplateNode)
            and node.template_name
            in ["penerang", "qualifier", "i", "q", "qual"]
            and len(tr_list) > 0
        ):
            # A qualifier applies to the translation directly before it.
            raw_tag = clean_node(wxr, None, node).strip("() ")
            if raw_tag != "":
                tr_list[-1].raw_tags.append(raw_tag)
                translate_raw_tags(tr_list[-1])
        elif isinstance(node, WikiNode) and node.kind == NodeKind.LIST:
            # The nested call starts again with lang_name "unknown"; the
            # language name found here is not passed down.
            for child_list_item in node.find_child(NodeKind.LIST_ITEM):
                tr_list.extend(
                    extract_translation_list_item(wxr, child_list_item, sense)
                )
    return tr_list

89 

90 

def extract_t_template(
    wxr: WiktextractContext,
    t_node: TemplateNode,
    sense: str,
    lang_name: str,
) -> Translation:
    """Build a ``Translation`` from a single translation template node.

    The language code is the cleaned first positional template argument,
    or ``"unknown"`` when that is empty. The template is expanded and its
    ``<span>`` elements are inspected:

    * the first span whose ``lang`` attribute equals the language code
      provides the translated word;
    * a span with class ``gender`` provides raw tags from its ``<abbr>``
      children (empty text, ``?`` and ``jantina tidak diberi`` are skipped);
    * a span whose class contains ``tr`` provides the romanization.

    When a word was found, the raw tags are translated and the top-level
    link nodes of the expansion are cleaned with the translation as the
    category target.

    Returns:
        The populated ``Translation``; its ``word`` is ``""`` when no
        matching span was found.
    """
    first_arg = t_node.template_parameters.get(1, "")
    lang_code = clean_node(wxr, None, first_arg) or "unknown"
    result = Translation(
        word="", lang=lang_name, lang_code=lang_code, sense=sense
    )

    wikitext = wxr.wtp.node_to_wikitext(t_node)
    expanded = wxr.wtp.parse(wikitext, expand_all=True)
    skipped_gender_labels = ("", "?", "jantina tidak diberi")

    for span in expanded.find_html("span"):
        css_class = span.attrs.get("class", "")
        if span.attrs.get("lang") == lang_code and result.word == "":
            result.word = clean_node(wxr, None, span)
            continue
        if css_class == "gender":
            for abbr in span.find_html("abbr"):
                label = clean_node(wxr, None, abbr)
                if label not in skipped_gender_labels:
                    result.raw_tags.append(label)
            continue
        if "tr" in css_class:
            result.roman = clean_node(wxr, None, span)

    if result.word == "":
        # No translated word: skip tag translation and link cleaning.
        return result

    translate_raw_tags(result)
    for link in expanded.find_child(NodeKind.LINK):
        clean_node(wxr, result, link)
    return result