Coverage for src / wiktextract / extractor / pl / translation.py: 91%

86 statements  

« prev     ^ index     » next       coverage.py v7.13.1, created at 2025-12-29 01:50 +0000

1import re 

2from collections import defaultdict 

3 

4from mediawiki_langcodes import name_to_code 

5from wikitextprocessor.parser import NodeKind, TemplateNode, WikiNode 

6 

7from ...page import clean_node 

8from ...wxr_context import WiktextractContext 

9from .models import Translation, WordEntry 

10from .tags import translate_raw_tags 

11 

12 

def extract_translation_section(
    wxr: WiktextractContext,
    page_data: list[WordEntry],
    level_node: WikiNode,
    lang_code: str,
) -> None:
    """Collect translations from a translation section and attach them to
    the entries in ``page_data`` whose language matches ``lang_code``.

    Translations carrying a sense index go to the entry whose sense matches;
    index-less ones go to every matching entry; leftovers with an index that
    matched nothing fall back to the first matching entry.
    """
    from .page import match_sense_index

    tr_by_index = defaultdict(list)
    for item_node in level_node.find_child_recursively(NodeKind.LIST_ITEM):
        process_translation_list_item(wxr, item_node, tr_by_index)

    used_indexes = set()
    for entry in page_data:
        if entry.lang_code != lang_code:
            continue
        for idx in tr_by_index:
            if match_sense_index(idx, entry):
                entry.translations.extend(tr_by_index[idx])
                used_indexes.add(idx)
        # translations without any sense index apply to every entry
        # of this language
        entry.translations.extend(tr_by_index.get("", []))

    # second pass: indexed translations that matched no sense are added
    # to the first entry of this language as a fallback
    tr_by_index.pop("", None)
    for entry in page_data:
        if entry.lang_code != lang_code:
            continue
        for idx, tr_list in tr_by_index.items():
            if idx not in used_indexes:
                entry.translations.extend(tr_list)
        break

42 

43 

def process_translation_list_item(
    wxr: WiktextractContext,
    list_item: WikiNode,
    translations: dict[str, list[Translation]],
) -> None:
    """Parse one translation list item and append ``Translation`` objects
    to ``translations``, keyed by sense index ("" when none was found).

    Expected item shape (pl.wiktionary): "<language name>: [[word]] (1.1),
    {{tag}} [[word]] ...". State is carried across children of the item:
    the current language, the current sense index, the last created
    translation, and tags collected for the next word.
    """
    lang_name = "unknown"
    lang_code = "unknown"
    sense_index = ""  # e.g. "1.1"; sticks until a new "(d.d)" is seen
    last_tr_data = None  # most recently created Translation, if any
    last_node = None  # previous child node, used for adjacency checks
    raw_tags = []  # tags collected to attach to the NEXT word
    for index, node in enumerate(list_item.children):
        if isinstance(node, str):
            # the first string child starts with "<language name>:"
            if index == 0 and ":" in node:
                lang_name = node[: node.index(":")].strip() or "unknown"
                lang_code = name_to_code(lang_name, "pl") or "unknown"
            # sense index like "(1.1)" updates the current index
            m_index = re.search(r"\(\d+\.\d+\)", node)
            if m_index is not None:
                sense_index = m_index.group(0).strip("()")
            # any other parenthesized span is treated as a romanization
            # of the previous word (must not be the sense-index match)
            m_roman = re.search(r"\([^()]+\)", node)
            if (
                m_roman is not None
                and last_tr_data is not None
                and (m_index is None or m_index.start() != m_roman.start())
            ):
                last_tr_data.roman = m_roman.group(0).strip("()")
        elif isinstance(node, WikiNode) and node.kind == NodeKind.LINK:
            word = clean_node(wxr, None, node)
            if len(word) == 0:
                continue
            if (
                isinstance(last_node, WikiNode)
                and last_node.kind == NodeKind.LINK
                and last_tr_data is not None
            ):
                # two links directly next to each other form one word
                last_tr_data.word += word
            else:
                new_tr_data = Translation(
                    word=word,
                    sense_index=sense_index,
                    lang=lang_name,
                    lang_code=lang_code,
                    raw_tags=raw_tags,
                )
                translate_raw_tags(new_tr_data)
                translations[sense_index].append(new_tr_data)
                last_tr_data = new_tr_data
                # NOTE(review): clearing here assumes Translation copies
                # the raw_tags list on construction (pydantic models do);
                # otherwise this would also empty new_tr_data.raw_tags —
                # confirm against the model definition
                raw_tags.clear()
        elif isinstance(node, TemplateNode):
            if node.template_name == "furi":
                # Japanese translations: "furi" template supplies the
                # kanji text plus its furigana reading
                word, furigana = extract_furi_template(wxr, node)
                if (
                    isinstance(last_node, WikiNode)
                    and last_node.kind == NodeKind.LINK
                    and last_tr_data is not None
                ):
                    # template directly after a link extends that word
                    last_tr_data.word += word
                    last_tr_data.ruby = [(word, furigana)]
                else:
                    new_tr_data = Translation(
                        word=word,
                        sense_index=sense_index,
                        lang=lang_name,
                        lang_code=lang_code,
                        raw_tags=raw_tags,
                        ruby=[(word, furigana)],
                    )
                    translate_raw_tags(new_tr_data)
                    translations[sense_index].append(new_tr_data)
                    last_tr_data = new_tr_data
                    raw_tags.clear()
            elif isinstance(last_node, str) and (
                "," in last_node or ";" in last_node
            ):
                # a template right after a separator qualifies the
                # NEXT word: stash it until that word is created
                raw_tag = clean_node(wxr, None, node)
                if len(raw_tag) > 0:
                    raw_tags.append(raw_tag)
            elif last_tr_data is not None:
                # otherwise the template qualifies the PREVIOUS word
                raw_tag = clean_node(wxr, None, node)
                if len(raw_tag) > 0:
                    last_tr_data.raw_tags.append(raw_tag)
                    translate_raw_tags(last_tr_data)
        last_node = node

128 

129 

def extract_furi_template(
    wxr: WiktextractContext, node: TemplateNode
) -> tuple[str, str]:
    """Expand a "furi" template and return ``(kanji, furigana)``.

    https://pl.wiktionary.org/wiki/Szablon:furi
    """
    wikitext = wxr.wtp.node_to_wikitext(node)
    expanded = wxr.wtp.parse(wikitext, expand_all=True)
    kanji_text = clean_node(wxr, None, node.template_parameters.get(1, ""))
    caption = ""
    # the reading is rendered inside a span with class "furigana-caption";
    # if several are present the last one wins
    for html_span in expanded.find_html_recursively(
        "span", attr_name="class", attr_value="furigana-caption"
    ):
        caption = clean_node(wxr, None, html_span).strip("()")
    return kanji_text, caption