Coverage for src/wiktextract/extractor/pl/translation.py: 89%

88 statements  

« prev     ^ index     » next       coverage.py v7.6.10, created at 2024-12-27 08:07 +0000

1import re 

2from collections import defaultdict 

3 

4from mediawiki_langcodes import name_to_code 

5from wikitextprocessor.parser import NodeKind, TemplateNode, WikiNode 

6 

7from ...page import clean_node 

8from ...wxr_context import WiktextractContext 

9from .models import Translation, WordEntry 

10from .tags import translate_raw_tags 

11 

12 

def extract_translation_section(
    wxr: WiktextractContext,
    page_data: list[WordEntry],
    level_node: WikiNode,
    lang_code: str,
) -> None:
    """Collect translations under ``level_node`` and attach them to entries.

    Every list item found below ``level_node`` is parsed by
    ``process_translation_list_item`` into a mapping of sense index to
    translations.  Only entries of ``page_data`` whose ``lang_code`` equals
    the ``lang_code`` argument are touched:

    * a group is added to each entry for which ``match_sense_index`` accepts
      the group's sense index;
    * the group stored under the empty sense index is added to each of
      those entries as well;
    * groups with a non-empty sense index that matched no entry are added
      to the first entry with that ``lang_code``.
    """
    # Imported inside the function, as in the original code (presumably to
    # avoid a circular import with ``.page`` - not verifiable from here).
    from .page import match_sense_index

    translations = defaultdict(list)
    for list_item in level_node.find_child_recursively(NodeKind.LIST_ITEM):
        process_translation_list_item(wxr, list_item, translations)

    matched_indexes = set()
    for entry in page_data:
        if entry.lang_code != lang_code:
            continue
        for index_key, group in translations.items():
            if match_sense_index(index_key, entry):
                entry.translations.extend(group)
                matched_indexes.add(index_key)
        # ``get`` is used so a missing "" key is not created in the
        # defaultdict.
        entry.translations.extend(translations.get("", []))

    # The "" group has been handed out above; it must not be treated as an
    # unmatched group below.
    translations.pop("", None)

    first_entry = next(
        (entry for entry in page_data if entry.lang_code == lang_code), None
    )
    if first_entry is not None:
        for index_key, group in translations.items():
            if index_key not in matched_indexes:
                first_entry.translations.extend(group)

42 

43 

# "(1.2)"-style sense index marker.
_SENSE_INDEX_RE = re.compile(r"\(\d+\.\d+\)")
# First parenthesised group without nested parentheses.
_PARENTHESIZED_RE = re.compile(r"\([^()]+\)")


def _continues_previous_link(last_node, last_tr_data) -> bool:
    """Tell whether the previous child was a link that produced a translation."""
    return (
        isinstance(last_node, WikiNode)
        and last_node.kind == NodeKind.LINK
        and last_tr_data is not None
    )


def _append_translation(
    translations: dict[str, list[Translation]],
    pending_raw_tags: list[str],
    *,
    word: str,
    sense_index: str,
    lang_name: str,
    lang_code: str,
    ruby=None,
) -> Translation:
    """Create a translation, store it under its sense index and return it.

    ``pending_raw_tags`` is emptied afterwards.  ``ruby`` is only passed on
    to ``Translation`` when it is given.
    """
    extra = {} if ruby is None else {"ruby": ruby}
    tr_data = Translation(
        word=word,
        sense_index=sense_index,
        lang=lang_name,
        lang_code=lang_code,
        # Pass a copy: the pending list is cleared right below, and this
        # must not depend on whether Translation copies the list it gets.
        raw_tags=list(pending_raw_tags),
        **extra,
    )
    translate_raw_tags(tr_data)
    translations[sense_index].append(tr_data)
    pending_raw_tags.clear()
    return tr_data


def process_translation_list_item(
    wxr: WiktextractContext,
    list_item: WikiNode,
    translations: dict[str, list[Translation]],
) -> None:
    """Parse one translation list item into ``translations``.

    The children of ``list_item`` are visited in order:

    * String children: if the first child is a string containing ``":"``,
      the text before the colon is the language name; its code is looked up
      with ``name_to_code(lang_name, "pl")`` and replaced by ``"unknown"``
      when the lookup returns ``""``.  A ``(digits.digits)`` marker sets the
      sense index for the following translations.  The first parenthesised
      group of the string, unless it is that sense-index marker, is stored
      as ``roman`` on the most recent translation.
    * Link children: the cleaned link text becomes a new translation, or is
      appended to the previous translation's word when the previous child
      was also a link.
    * ``furi`` templates: handled like links, and additionally set ``ruby``
      to ``[(word, furigana)]``.
    * Other templates: their cleaned text is a raw tag.  It is queued for
      the next translation when the previous child is a string containing
      ``","`` or ``";"``; otherwise it is added to the most recent
      translation (and dropped if there is none yet).

    New translations are appended to ``translations[sense_index]``.
    """
    lang_name = ""
    lang_code = ""
    sense_index = ""
    last_tr_data = None
    last_node = None
    raw_tags = []  # raw tags queued for the next translation
    for index, node in enumerate(list_item.children):
        if isinstance(node, str):
            if index == 0 and ":" in node:
                lang_name = node[: node.index(":")].strip()
                lang_code = name_to_code(lang_name, "pl")
                if lang_code == "":
                    lang_code = "unknown"
            m_index = _SENSE_INDEX_RE.search(node)
            if m_index is not None:
                sense_index = m_index.group(0).strip("()")
            m_roman = _PARENTHESIZED_RE.search(node)
            if (
                m_roman is not None
                and last_tr_data is not None
                # the first parenthesised group may be the sense index itself
                and (m_index is None or m_index.start() != m_roman.start())
            ):
                last_tr_data.roman = m_roman.group(0).strip("()")
        elif isinstance(node, WikiNode) and node.kind == NodeKind.LINK:
            word = clean_node(wxr, None, node)
            if not word:
                # NOTE: skipping here also leaves ``last_node`` unchanged,
                # exactly as the original ``continue`` did.
                continue
            if _continues_previous_link(last_node, last_tr_data):
                # two links directly next to each other form one word
                last_tr_data.word += word
            else:
                last_tr_data = _append_translation(
                    translations,
                    raw_tags,
                    word=word,
                    sense_index=sense_index,
                    lang_name=lang_name,
                    lang_code=lang_code,
                )
        elif isinstance(node, TemplateNode):
            if node.template_name == "furi":
                word, furigana = extract_furi_template(wxr, node)
                if _continues_previous_link(last_node, last_tr_data):
                    last_tr_data.word += word
                    last_tr_data.ruby = [(word, furigana)]
                else:
                    last_tr_data = _append_translation(
                        translations,
                        raw_tags,
                        word=word,
                        sense_index=sense_index,
                        lang_name=lang_name,
                        lang_code=lang_code,
                        ruby=[(word, furigana)],
                    )
            elif isinstance(last_node, str) and (
                "," in last_node or ";" in last_node
            ):
                # template right after a separator: tag for the next word
                raw_tag = clean_node(wxr, None, node)
                if raw_tag:
                    raw_tags.append(raw_tag)
            elif last_tr_data is not None:
                # otherwise the tag belongs to the most recent translation
                raw_tag = clean_node(wxr, None, node)
                if raw_tag:
                    last_tr_data.raw_tags.append(raw_tag)
                    translate_raw_tags(last_tr_data)
        last_node = node

130 

131 

def extract_furi_template(
    wxr: WiktextractContext, node: TemplateNode
) -> tuple[str, str]:
    """Return ``(kanji, furigana)`` for a ``furi`` template node.

    The kanji text is the cleaned value of template parameter ``1`` (``""``
    when absent).  The furigana comes from the expanded template: the
    cleaned text of the ``span`` whose class is ``furigana-caption``, with
    surrounding parentheses stripped.  If several such spans are found the
    last one wins; if none is found the furigana is ``""``.

    Template page: https://pl.wiktionary.org/wiki/Szablon:furi
    """
    wikitext = wxr.wtp.node_to_wikitext(node)
    expanded = wxr.wtp.parse(wikitext, expand_all=True)

    first_arg = node.template_parameters.get(1, "")
    kanji = clean_node(wxr, None, first_arg)

    reading = ""
    captions = expanded.find_html_recursively(
        "span", attr_name="class", attr_value="furigana-caption"
    )
    for caption in captions:
        reading = clean_node(wxr, None, caption).strip("()")
    return kanji, reading