Coverage for src / wiktextract / extractor / pl / translation.py: 91%
86 statements
« prev ^ index » next coverage.py v7.13.1, created at 2025-12-29 01:50 +0000
1import re
2from collections import defaultdict
4from mediawiki_langcodes import name_to_code
5from wikitextprocessor.parser import NodeKind, TemplateNode, WikiNode
7from ...page import clean_node
8from ...wxr_context import WiktextractContext
9from .models import Translation, WordEntry
10from .tags import translate_raw_tags
def extract_translation_section(
    wxr: WiktextractContext,
    page_data: list[WordEntry],
    level_node: WikiNode,
    lang_code: str,
) -> None:
    """Parse a translation section and attach the results to *page_data*.

    Translations are grouped by sense index; each group is appended to the
    entries of the matching language whose sense index matches. Index-less
    translations go to every matching entry, and groups whose index matched
    no sense fall back onto the first entry of that language.
    """
    from .page import match_sense_index

    tr_by_index = defaultdict(list)
    for item in level_node.find_child_recursively(NodeKind.LIST_ITEM):
        process_translation_list_item(wxr, item, tr_by_index)

    used_indexes = set()
    for entry in page_data:
        if entry.lang_code != lang_code:
            continue
        for idx in tr_by_index:
            if match_sense_index(idx, entry):
                entry.translations.extend(tr_by_index[idx])
                used_indexes.add(idx)
        # translations without any sense index apply to every matching entry
        entry.translations.extend(tr_by_index.get("", []))

    tr_by_index.pop("", None)
    # fallback: indexes that matched no sense land on the first entry
    # of this language
    for entry in page_data:
        if entry.lang_code == lang_code:
            for idx, tr_list in tr_by_index.items():
                if idx not in used_indexes:
                    entry.translations.extend(tr_list)
            break
def process_translation_list_item(
    wxr: WiktextractContext,
    list_item: WikiNode,
    translations: dict[str, list[Translation]],
) -> None:
    """Parse one translation list item into *translations*.

    A list item looks like ``język: (1.1) [[word]], [[word]] {{tag}} …``:
    the text before the first ":" is the Polish language name, "(d.d)"
    strings are sense indexes, links are translated words, and templates
    are either furigana ("furi") or tag/qualifier templates. Results are
    appended to ``translations`` keyed by sense index ("" if none seen).
    """
    lang_name = "unknown"
    lang_code = "unknown"
    sense_index = ""
    # most recently created Translation; plain-text romanizations and
    # trailing tag templates attach to it
    last_tr_data = None
    # previous child node; used to detect adjacent links and "," / ";"
    # separators that change how a template is interpreted
    last_node = None
    # tags collected before a word; consumed by the next Translation
    raw_tags = []
    for index, node in enumerate(list_item.children):
        if isinstance(node, str):
            if index == 0 and ":" in node:
                # leading "language name:" prefix
                lang_name = node[: node.index(":")].strip() or "unknown"
                lang_code = name_to_code(lang_name, "pl") or "unknown"
            m_index = re.search(r"\(\d+\.\d+\)", node)
            if m_index is not None:
                # "(1.1)" style sense index; applies to following words
                sense_index = m_index.group(0).strip("()")
            m_roman = re.search(r"\([^()]+\)", node)
            if (
                m_roman is not None
                and last_tr_data is not None
                # a parenthesized group that is not the sense index itself
                # is treated as a romanization of the previous word
                and (m_index is None or m_index.start() != m_roman.start())
            ):
                last_tr_data.roman = m_roman.group(0).strip("()")
        elif isinstance(node, WikiNode) and node.kind == NodeKind.LINK:
            word = clean_node(wxr, None, node)
            if len(word) == 0:
                continue
            if (
                isinstance(last_node, WikiNode)
                and last_node.kind == NodeKind.LINK
                and last_tr_data is not None
            ):
                # two links directly next to each other form one word
                last_tr_data.word += word
            else:
                new_tr_data = Translation(
                    word=word,
                    sense_index=sense_index,
                    lang=lang_name,
                    lang_code=lang_code,
                    raw_tags=raw_tags,
                )
                translate_raw_tags(new_tr_data)
                translations[sense_index].append(new_tr_data)
                last_tr_data = new_tr_data
                # tags were consumed by this word (the model copies the
                # list on validation — presumably pydantic; so clearing
                # here does not affect new_tr_data)
                raw_tags.clear()
        elif isinstance(node, TemplateNode):
            if node.template_name == "furi":
                # Japanese word with furigana reading
                word, furigana = extract_furi_template(wxr, node)
                if (
                    isinstance(last_node, WikiNode)
                    and last_node.kind == NodeKind.LINK
                    and last_tr_data is not None
                ):
                    # furi template right after a link extends that word
                    last_tr_data.word += word
                    last_tr_data.ruby = [(word, furigana)]
                else:
                    new_tr_data = Translation(
                        word=word,
                        sense_index=sense_index,
                        lang=lang_name,
                        lang_code=lang_code,
                        raw_tags=raw_tags,
                        ruby=[(word, furigana)],
                    )
                    translate_raw_tags(new_tr_data)
                    translations[sense_index].append(new_tr_data)
                    last_tr_data = new_tr_data
                    raw_tags.clear()
            elif isinstance(last_node, str) and (
                "," in last_node or ";" in last_node
            ):
                # template after a separator qualifies the NEXT word
                raw_tag = clean_node(wxr, None, node)
                if len(raw_tag) > 0:
                    raw_tags.append(raw_tag)
            elif last_tr_data is not None:
                # template directly after a word tags that word
                raw_tag = clean_node(wxr, None, node)
                if len(raw_tag) > 0:
                    last_tr_data.raw_tags.append(raw_tag)
                    translate_raw_tags(last_tr_data)
        last_node = node
def extract_furi_template(
    wxr: WiktextractContext, node: TemplateNode
) -> tuple[str, str]:
    """Expand a "furi" template and return ``(kanji, furigana)``.

    https://pl.wiktionary.org/wiki/Szablon:furi
    The kanji text is the template's first positional argument; the
    furigana is read from the expanded HTML's "furigana-caption" span
    (the last such span wins; "" if none is found).
    """
    expanded = wxr.wtp.parse(
        wxr.wtp.node_to_wikitext(node), expand_all=True
    )
    kanji = clean_node(wxr, None, node.template_parameters.get(1, ""))
    furigana = ""
    for caption_span in expanded.find_html_recursively(
        "span", attr_name="class", attr_value="furigana-caption"
    ):
        furigana = clean_node(wxr, None, caption_span).strip("()")
    return kanji, furigana