Coverage for src/wiktextract/extractor/pl/translation.py: 89%
88 statements
« prev ^ index » next coverage.py v7.6.10, created at 2024-12-27 08:07 +0000
1import re
2from collections import defaultdict
4from mediawiki_langcodes import name_to_code
5from wikitextprocessor.parser import NodeKind, TemplateNode, WikiNode
7from ...page import clean_node
8from ...wxr_context import WiktextractContext
9from .models import Translation, WordEntry
10from .tags import translate_raw_tags
def extract_translation_section(
    wxr: WiktextractContext,
    page_data: list[WordEntry],
    level_node: WikiNode,
    lang_code: str,
) -> None:
    """Parse a translation section and attach results to matching entries.

    Translations are grouped by sense index. Each group is appended to the
    word entries of the given language whose senses match that index;
    index-less translations (key ``""``) go to every matching entry, and
    groups whose index matches no sense fall back to the first entry only.
    """
    from .page import match_sense_index

    tr_by_index = defaultdict(list)
    for item_node in level_node.find_child_recursively(NodeKind.LIST_ITEM):
        process_translation_list_item(wxr, item_node, tr_by_index)

    same_lang_entries = [d for d in page_data if d.lang_code == lang_code]
    used_indexes = set()
    for entry in same_lang_entries:
        for idx in tr_by_index:
            if match_sense_index(idx, entry):
                entry.translations.extend(tr_by_index[idx])
                used_indexes.add(idx)
        # Index-less translations apply to every entry of this language.
        entry.translations.extend(tr_by_index.get("", []))

    tr_by_index.pop("", None)
    # Fall back: dump groups that matched no sense onto the first entry.
    for entry in same_lang_entries:
        for idx, tr_list in tr_by_index.items():
            if idx not in used_indexes:
                entry.translations.extend(tr_list)
        break
def process_translation_list_item(
    wxr: WiktextractContext,
    list_item: WikiNode,
    translations: dict[str, list[Translation]],
) -> None:
    """Parse one translation list item ("Language: word (1.1) word, ...").

    New ``Translation`` objects are appended to *translations*, keyed by
    sense index (``""`` when no "(d.d)" index has been seen yet).
    """
    lang_name = ""
    lang_code = ""
    sense_index = ""
    last_tr_data = None  # most recently created Translation, if any
    last_node = None  # previous sibling node; drives merge/tag decisions
    raw_tags = []  # tags collected before the next word is created
    for index, node in enumerate(list_item.children):
        if isinstance(node, str):
            # The first text chunk holds the language name before ":".
            if index == 0 and ":" in node:
                lang_name = node[: node.index(":")].strip()
                lang_code = name_to_code(lang_name, "pl")
                if lang_code == "":
                    lang_code = "unknown"
            # A "(d.d)" marker sets the sense index for following words.
            m_index = re.search(r"\(\d+\.\d+\)", node)
            if m_index is not None:
                sense_index = m_index.group(0).strip("()")
            # Any other parenthesized text is taken as a romanization of
            # the previous word — unless it is the sense index itself.
            m_roman = re.search(r"\([^()]+\)", node)
            if (
                m_roman is not None
                and last_tr_data is not None
                and (m_index is None or m_index.start() != m_roman.start())
            ):
                last_tr_data.roman = m_roman.group(0).strip("()")
        elif isinstance(node, WikiNode) and node.kind == NodeKind.LINK:
            word = clean_node(wxr, None, node)
            if len(word) == 0:
                continue
            if (
                isinstance(last_node, WikiNode)
                and last_node.kind == NodeKind.LINK
                and last_tr_data is not None
            ):
                # two links directly next to each other form one word
                last_tr_data.word += word
            else:
                new_tr_data = Translation(
                    word=word,
                    sense_index=sense_index,
                    lang=lang_name,
                    lang_code=lang_code,
                    raw_tags=raw_tags,
                )
                translate_raw_tags(new_tr_data)
                translations[sense_index].append(new_tr_data)
                last_tr_data = new_tr_data
                # NOTE(review): clearing the list just passed as raw_tags=
                # is only safe if Translation copies it during validation
                # (true for pydantic models) — confirm in models.py.
                raw_tags.clear()
        elif isinstance(node, TemplateNode):
            if node.template_name == "furi":
                # Japanese word with a furigana reading.
                word, furigana = extract_furi_template(wxr, node)
                if (
                    isinstance(last_node, WikiNode)
                    and last_node.kind == NodeKind.LINK
                    and last_tr_data is not None
                ):
                    # A furi template right after a link extends that word.
                    last_tr_data.word += word
                    last_tr_data.ruby = [(word, furigana)]
                else:
                    new_tr_data = Translation(
                        word=word,
                        sense_index=sense_index,
                        lang=lang_name,
                        lang_code=lang_code,
                        raw_tags=raw_tags,
                        ruby=[(word, furigana)],
                    )
                    translate_raw_tags(new_tr_data)
                    translations[sense_index].append(new_tr_data)
                    last_tr_data = new_tr_data
                    # See NOTE above on the aliasing of raw_tags.
                    raw_tags.clear()
            elif isinstance(last_node, str) and (
                "," in last_node or ";" in last_node
            ):
                # Template after a separator starts tags for the NEXT word.
                raw_tag = clean_node(wxr, None, node)
                if len(raw_tag) > 0:
                    raw_tags.append(raw_tag)
            elif last_tr_data is not None:
                # Otherwise the template qualifies the PREVIOUS word.
                raw_tag = clean_node(wxr, None, node)
                if len(raw_tag) > 0:
                    last_tr_data.raw_tags.append(raw_tag)
                    translate_raw_tags(last_tr_data)
        last_node = node
def extract_furi_template(
    wxr: WiktextractContext, node: TemplateNode
) -> tuple[str, str]:
    """Expand a {{furi}} template into a ``(kanji, furigana)`` pair.

    https://pl.wiktionary.org/wiki/Szablon:furi
    """
    # The kanji text is the template's first positional parameter.
    kanji_text = clean_node(wxr, None, node.template_parameters.get(1, ""))
    wikitext = wxr.wtp.node_to_wikitext(node)
    expanded = wxr.wtp.parse(wikitext, expand_all=True)
    reading = ""
    # The expanded HTML marks the reading with a "furigana-caption" span;
    # if several are present, the last one wins.
    for caption_span in expanded.find_html_recursively(
        "span", attr_name="class", attr_value="furigana-caption"
    ):
        reading = clean_node(wxr, None, caption_span).strip("()")
    return kanji_text, reading