Coverage for src / wiktextract / extractor / pl / translation.py: 91%

86 statements  

« prev     ^ index     » next       coverage.py v7.13.1, created at 2025-12-29 01:50 +0000

1import re 

2from collections import defaultdict 

3 

4from mediawiki_langcodes import name_to_code 

5from wikitextprocessor.parser import NodeKind, TemplateNode, WikiNode 

6 

7from ...page import clean_node 

8from ...wxr_context import WiktextractContext 

9from .models import Translation, WordEntry 

10from .tags import translate_raw_tags 

11 

12 

def extract_translation_section(
    wxr: WiktextractContext,
    page_data: list[WordEntry],
    level_node: WikiNode,
    lang_code: str,
) -> None:
    """Collect translations from a translation section and attach them to
    the entries in ``page_data`` whose language matches ``lang_code``.

    Translations carrying a sense index go to the entry whose sense matches;
    index-less ones go to every matching entry; leftovers with an index that
    matched nothing fall back to the first matching entry.
    """
    from .page import match_sense_index

    tr_by_index = defaultdict(list)
    for item_node in level_node.find_child_recursively(NodeKind.LIST_ITEM):
        process_translation_list_item(wxr, item_node, tr_by_index)

    used_indexes = set()
    for entry in page_data:
        if entry.lang_code != lang_code:
            continue
        for idx in tr_by_index:
            if match_sense_index(idx, entry):
                entry.translations.extend(tr_by_index[idx])
                used_indexes.add(idx)
        # translations without any sense index apply to every entry
        # of this language
        entry.translations.extend(tr_by_index.get("", []))

    # second pass: indexed translations that matched no sense are added
    # to the first entry of this language as a fallback
    tr_by_index.pop("", None)
    for entry in page_data:
        if entry.lang_code != lang_code:
            continue
        for idx, tr_list in tr_by_index.items():
            if idx not in used_indexes:
                entry.translations.extend(tr_list)
        break

42 

43 

def process_translation_list_item(
    wxr: WiktextractContext,
    list_item: WikiNode,
    translations: dict[str, list[Translation]],
) -> None:
    """Parse one translation list item and append ``Translation`` objects
    to ``translations``, keyed by sense index ("" when none was found).

    Expected item shape (pl.wiktionary): "<language name>: [[word]] (1.1),
    {{tag}} [[word]] ...". State is carried across children of the item:
    the current language, the current sense index, the last created
    translation, and tags collected for the next word.
    """
    lang_name = "unknown"
    lang_code = "unknown"
    sense_index = ""  # e.g. "1.1"; sticks until a new "(d.d)" is seen
    last_tr_data = None  # most recently created Translation, if any
    last_node = None  # previous child node, used for adjacency checks
    raw_tags = []  # tags collected to attach to the NEXT word
    for index, node in enumerate(list_item.children):
        if isinstance(node, str):
            # the first string child starts with "<language name>:"
            if index == 0 and ":" in node:
                lang_name = node[: node.index(":")].strip() or "unknown"
                lang_code = name_to_code(lang_name, "pl") or "unknown"
            # sense index like "(1.1)" updates the current index
            m_index = re.search(r"\(\d+\.\d+\)", node)
            if m_index is not None:
                sense_index = m_index.group(0).strip("()")
            # any other parenthesized span is treated as a romanization
            # of the previous word (must not be the sense-index match)
            m_roman = re.search(r"\([^()]+\)", node)
            if (
                m_roman is not None
                and last_tr_data is not None
                and (m_index is None or m_index.start() != m_roman.start())
            ):
                last_tr_data.roman = m_roman.group(0).strip("()")
        elif isinstance(node, WikiNode) and node.kind == NodeKind.LINK:
            word = clean_node(wxr, None, node)
            if len(word) == 0:
                continue
            if (
                isinstance(last_node, WikiNode)
                and last_node.kind == NodeKind.LINK
                and last_tr_data is not None
            ):
                # two links directly next to each other form one word
                last_tr_data.word += word
            else:
                new_tr_data = Translation(
                    word=word,
                    sense_index=sense_index,
                    lang=lang_name,
                    lang_code=lang_code,
                    raw_tags=raw_tags,
                )
                translate_raw_tags(new_tr_data)
                translations[sense_index].append(new_tr_data)
                last_tr_data = new_tr_data
                # NOTE(review): clearing here assumes Translation copies
                # the raw_tags list on construction (pydantic models do);
                # otherwise this would also empty new_tr_data.raw_tags —
                # confirm against the model definition
                raw_tags.clear()
        elif isinstance(node, TemplateNode):
            if node.template_name == "furi":
                # Japanese translations: "furi" template supplies the
                # kanji text plus its furigana reading
                word, furigana = extract_furi_template(wxr, node)
                if (
                    isinstance(last_node, WikiNode)
                    and last_node.kind == NodeKind.LINK
                    and last_tr_data is not None
                ):
                    # template directly after a link extends that word
                    last_tr_data.word += word
                    last_tr_data.ruby = [(word, furigana)]
                else:
                    new_tr_data = Translation(
                        word=word,
                        sense_index=sense_index,
                        lang=lang_name,
                        lang_code=lang_code,
                        raw_tags=raw_tags,
                        ruby=[(word, furigana)],
                    )
                    translate_raw_tags(new_tr_data)
                    translations[sense_index].append(new_tr_data)
                    last_tr_data = new_tr_data
                    raw_tags.clear()
            elif isinstance(last_node, str) and (
                "," in last_node or ";" in last_node
            ):
                # a template right after a separator qualifies the
                # NEXT word: stash it until that word is created
                raw_tag = clean_node(wxr, None, node)
                if len(raw_tag) > 0:
                    raw_tags.append(raw_tag)
            elif last_tr_data is not None:
                # otherwise the template qualifies the PREVIOUS word
                raw_tag = clean_node(wxr, None, node)
                if len(raw_tag) > 0:
                    last_tr_data.raw_tags.append(raw_tag)
                    translate_raw_tags(last_tr_data)
        last_node = node

128 

129 

def extract_furi_template(
    wxr: WiktextractContext, node: TemplateNode
) -> tuple[str, str]:
    """Expand a "furi" template and return ``(kanji, furigana)``.

    https://pl.wiktionary.org/wiki/Szablon:furi
    """
    wikitext = wxr.wtp.node_to_wikitext(node)
    expanded = wxr.wtp.parse(wikitext, expand_all=True)
    kanji_text = clean_node(wxr, None, node.template_parameters.get(1, ""))
    caption = ""
    # the reading is rendered inside a span with class "furigana-caption";
    # if several are present the last one wins
    for html_span in expanded.find_html_recursively(
        "span", attr_name="class", attr_value="furigana-caption"
    ):
        caption = clean_node(wxr, None, html_span).strip("()")
    return kanji_text, caption