Coverage for src/wiktextract/extractor/ms/translation.py: 76%

1from wikitextprocessor import LevelNode, NodeKind, TemplateNode, WikiNode

3from ...page import clean_node

4from ...wxr_context import WiktextractContext

5from .models import Translation, WordEntry

6from .tags import translate_raw_tags

def _attach_translations(
    target: WordEntry,
    section_categories: list[str],
    tr_list: list[Translation],
) -> None:
    """Append section categories and every non-empty translation to *target*."""
    target.categories.extend(section_categories)
    for tr_data in tr_list:
        # Translations whose word could not be extracted are dropped.
        if tr_data.word != "":
            target.translations.append(tr_data)
            target.categories.extend(tr_data.categories)


def extract_translation_section(
    wxr: WiktextractContext,
    page_data: list[WordEntry],
    base_data: WordEntry,
    level_node: LevelNode,
) -> None:
    """Extract the translations of one section and store them on word entries.

    The direct children of ``level_node`` are scanned in order: a
    "ter-atas" / "teratas" / "trans-top" template sets the sense text used
    for the translations that follow it, and every list contributes the
    translations found in its list items.

    The results are stored on:

    * ``base_data`` when ``page_data`` is empty or its last entry has a
      different ``lang_code`` than ``base_data``;
    * every entry of ``page_data`` sharing the last entry's ``lang_code``
      when ``level_node`` is a level-3 section;
    * only the last entry of ``page_data`` otherwise.

    Args:
        wxr: Extraction context passed through to ``clean_node``.
        page_data: Word entries collected so far for the page.
        base_data: Entry holding data shared by entries of one language.
        level_node: The section node to scan.
    """
    sense = ""
    tr_list: list[Translation] = []
    # Passed to clean_node for the header templates; its "categories" key
    # is read back below (presumably filled by clean_node - verify there).
    cats = {}
    for node in level_node.children:
        if isinstance(node, TemplateNode) and node.template_name in [
            "ter-atas",
            "teratas",
            "trans-top",
        ]:
            sense = clean_node(wxr, cats, node)
        elif isinstance(node, WikiNode) and node.kind == NodeKind.LIST:
            for list_item in node.find_child(NodeKind.LIST_ITEM):
                tr_list.extend(
                    extract_translation_list_item(wxr, list_item, sense)
                )

    section_categories = cats.get("categories", [])
    if not page_data or page_data[-1].lang_code != base_data.lang_code:
        _attach_translations(base_data, section_categories, tr_list)
    elif level_node.kind == NodeKind.LEVEL3:
        last_lang_code = page_data[-1].lang_code
        for data in page_data:
            if data.lang_code == last_lang_code:
                _attach_translations(data, section_categories, tr_list)
    else:
        _attach_translations(page_data[-1], section_categories, tr_list)

def extract_translation_list_item(
    wxr: WiktextractContext, list_item: WikiNode, sense: str
) -> list[Translation]:
    """Extract the translations contained in one list item.

    The children of ``list_item`` are processed in order:

    * the first plain string ending with ":" supplies the language name;
    * "t", "trad", "tø", "t-" and "t+" templates each produce one
      translation via ``extract_t_template``;
    * "penerang", "qualifier", "i", "q" and "qual" templates add a raw tag
      to the most recently extracted translation, if there is one;
    * nested lists are processed recursively, each nested item starting
      again with the language name "unknown".

    Args:
        wxr: Extraction context passed through to ``clean_node``.
        list_item: The list item node to scan.
        sense: Sense text assigned to every translation that is created.

    Returns:
        The translations found in this item and its nested lists, in
        document order. Entries with an empty ``word`` may be included.
    """
    tr_list: list[Translation] = []
    lang_name = "unknown"
    for node in list_item.children:
        if (
            isinstance(node, str)
            and node.strip().endswith(":")
            and lang_name == "unknown"
        ):
            # Fall back to "unknown" when nothing but ":" and spaces remain.
            lang_name = node.strip(": ") or "unknown"
        elif isinstance(node, TemplateNode) and node.template_name in [
            "t",
            "trad",
            "tø",
            "t-",
            "t+",
        ]:
            tr_list.append(extract_t_template(wxr, node, sense, lang_name))
        elif (
            isinstance(node, TemplateNode)
            and node.template_name
            in ["penerang", "qualifier", "i", "q", "qual"]
            and len(tr_list) > 0
        ):
            # A qualifier applies to the translation that precedes it.
            raw_tag = clean_node(wxr, None, node).strip("() ")
            if raw_tag != "":
                tr_list[-1].raw_tags.append(raw_tag)
                translate_raw_tags(tr_list[-1])
        elif isinstance(node, WikiNode) and node.kind == NodeKind.LIST:
            for child_list_item in node.find_child(NodeKind.LIST_ITEM):
                tr_list.extend(
                    extract_translation_list_item(wxr, child_list_item, sense)
                )
    return tr_list

def extract_t_template(
    wxr: WiktextractContext,
    t_node: TemplateNode,
    sense: str,
    lang_name: str,
) -> Translation:
    """Build a ``Translation`` from a single translation template node.

    The language code is the cleaned first template argument, or "unknown"
    when that is empty. The template is expanded and its ``span`` elements
    are inspected: the first span whose ``lang`` attribute equals the
    language code gives the word, a span with class "gender" gives raw tags
    from its ``abbr`` elements, and a span whose class contains "tr" gives
    the romanization.

    Args:
        wxr: Extraction context used for expansion and cleaning.
        t_node: The translation template node.
        sense: Sense text stored on the result.
        lang_name: Language name stored on the result.

    Returns:
        The populated ``Translation``; its ``word`` stays empty when no
        matching span was found.
    """
    lang_code = clean_node(wxr, None, t_node.template_parameters.get(1, ""))
    if not lang_code:
        lang_code = "unknown"
    translation = Translation(
        word="", lang=lang_name, lang_code=lang_code, sense=sense
    )

    wikitext = wxr.wtp.node_to_wikitext(t_node)
    root = wxr.wtp.parse(wikitext, expand_all=True)
    # Gender labels that carry no information and are not kept as tags.
    skipped_genders = ("", "?", "jantina tidak diberi")
    for span in root.find_html("span"):
        css_class = span.attrs.get("class", "")
        if translation.word == "" and span.attrs.get("lang") == lang_code:
            translation.word = clean_node(wxr, None, span)
        elif css_class == "gender":
            for abbr in span.find_html("abbr"):
                gender = clean_node(wxr, None, abbr)
                if gender not in skipped_genders:
                    translation.raw_tags.append(gender)
        elif "tr" in css_class:
            translation.roman = clean_node(wxr, None, span)

    # Without a word there is nothing worth tagging or categorizing.
    if translation.word == "":
        return translation

    translate_raw_tags(translation)
    # clean_node receives the translation as its data argument here,
    # presumably so categories from links land on it - verify in clean_node.
    for link in root.find_child(NodeKind.LINK):
        clean_node(wxr, translation, link)
    return translation

121 return tr_data