Coverage for src/wiktextract/extractor/pl/page.py: 72%
107 statements
coverage.py v7.6.10, created at 2024-12-27 08:07 +0000
import itertools
import re
from typing import Any

from wikitextprocessor.parser import LevelNode, NodeKind, TemplateNode, WikiNode

from ...page import clean_node
from ...wxr_context import WiktextractContext
from .etymology import extract_etymology_section
from .example import extract_example_section
from .inflection import extract_inflection_section
from .linkage import LINKAGE_TYPES, extract_linkage_section
from .models import Form, Sense, WordEntry
from .note import extract_note_section
from .pos import extract_pos_section
from .sound import extract_sound_section
from .translation import extract_translation_section


def parse_section(
    wxr: WiktextractContext,
    page_data: list[WordEntry],
    base_data: WordEntry,
    level_node: WikiNode,
) -> None:
    # title templates
    # https://pl.wiktionary.org/wiki/Kategoria:Szablony_szablonów_haseł
    title_text = clean_node(wxr, None, level_node.largs)
    wxr.wtp.start_subsection(title_text)
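    # Dispatch on the Polish section title: "wymowa" = pronunciation,
    # "znaczenia" = meanings (POS section), "przykłady" = examples,
    # "etymologia" = etymology, "tłumaczenia" = translations, "uwagi" = notes,
    # "odmiana" = inflection, "zapis" = written form, "transliteracja" =
    # transliteration; titles listed in LINKAGE_TYPES (synonyms, antonyms,
    # etc.) go to the linkage extractor.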
    if title_text == "wymowa" and wxr.config.capture_pronunciation:
        extract_sound_section(wxr, base_data, level_node)
    elif title_text == "znaczenia":
        extract_pos_section(wxr, page_data, base_data, level_node)
    elif title_text == "przykłady":
        extract_example_section(wxr, page_data, base_data, level_node)
    elif title_text == "etymologia" and wxr.config.capture_etymologies:
        extract_etymology_section(wxr, page_data, base_data, level_node)
    elif title_text == "tłumaczenia" and wxr.config.capture_translations:
        extract_translation_section(
            wxr, page_data, level_node, base_data.lang_code
        )
    elif title_text in LINKAGE_TYPES and wxr.config.capture_inflections:
        extract_linkage_section(
            wxr,
            page_data,
            level_node,
            LINKAGE_TYPES[title_text],
            base_data.lang_code,
        )
    elif title_text == "uwagi":
        extract_note_section(wxr, page_data, base_data, level_node)
    elif title_text == "odmiana" and wxr.config.capture_inflections:
        extract_inflection_section(
            wxr, page_data, base_data.lang_code, level_node
        )
    elif title_text == "zapis":
        extract_zapis_section(wxr, base_data, level_node)
    elif title_text == "transliteracja":
        extract_transliteracja_section(wxr, base_data, level_node)


def extract_zapis_section(
    wxr: WiktextractContext, base_data: WordEntry, level_node: LevelNode
) -> None:
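    # "zapis" (written form) sections are used for Chinese entries: the first
    # parameter of the {{ptrad}} template holds the traditional-script
    # spelling, which is saved as a form tagged "Traditional Chinese".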
    # search templates recursively to get past the "preformatted" node
    # that wraps the section content
    for node in level_node.find_child_recursively(NodeKind.TEMPLATE):
        if node.template_name == "ptrad":
            form_text = clean_node(
                wxr, None, node.template_parameters.get(1, "")
            )
            if form_text != "":
                base_data.forms.append(
                    Form(form=form_text, tags=["Traditional Chinese"])
                )


def extract_transliteracja_section(
    wxr: WiktextractContext, base_data: WordEntry, level_node: LevelNode
) -> None:
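    # Each list item carries plain text like "(1.1) roman", where the
    # parenthesized number is the sense index and the remainder of the line
    # is the romanized form.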
    for list_item in level_node.find_child_recursively(NodeKind.LIST_ITEM):
        for node in list_item.children:
            if isinstance(node, str):
                m = re.search(r"\([\d\s,-.]+\)", node)
                if m is not None:
                    sense_index = m.group(0).strip("()")
                    roman = node[m.end() :].strip()
                    if roman != "":
                        base_data.forms.append(
                            Form(
                                form=roman,
                                sense_index=sense_index,
                                tags=["romanization"],
                            )
                        )


def parse_page(
    wxr: WiktextractContext, page_title: str, page_text: str
) -> list[dict[str, Any]]:
    # page layout
    # https://pl.wiktionary.org/wiki/Wikisłownik:Zasady_tworzenia_haseł
    wxr.wtp.start_page(page_title)
    tree = wxr.wtp.parse(page_text, pre_expand=True)
    page_data: list[WordEntry] = []
    for level2_node in tree.find_child(NodeKind.LEVEL2):
        after_parenthesis = False
        lang_code = "unknown"
        lang_name = "unknown"
        lang_title_cats = {}
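        # A level-2 heading has the form "word ({{język xxx}})": the language
        # template after the opening parenthesis expands to markup whose
        # <span> id attribute is the language code, and its cleaned text is
        # the language name.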
        for title_content_node in itertools.chain.from_iterable(
            level2_node.largs
        ):
            if isinstance(
                title_content_node, str
            ) and title_content_node.strip().endswith("("):
                after_parenthesis = True
            elif (
                isinstance(title_content_node, TemplateNode)
                and after_parenthesis
            ):
                expanded_template = wxr.wtp.parse(
                    wxr.wtp.node_to_wikitext(title_content_node),
                    expand_all=True,
                )
                for span_tag in expanded_template.find_html("span"):
                    lang_code = span_tag.attrs.get("id", "")
                    break
                lang_name = clean_node(wxr, lang_title_cats, expanded_template)
                break
        if (
            wxr.config.capture_language_codes is not None
            and lang_code not in wxr.config.capture_language_codes
        ):
            continue
        wxr.wtp.start_section(lang_name)
        base_data = WordEntry(
            word=wxr.wtp.title,
            lang_code=lang_code,
            lang=lang_name,
            pos="unknown",
            categories=lang_title_cats.get("categories", []),
        )
        for level3_node in level2_node.find_child(NodeKind.LEVEL3):
            parse_section(wxr, page_data, base_data, level3_node)
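
    # give entries that ended up with no glosses a placeholder "no-gloss" sense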
    for data in page_data:
        if len(data.senses) == 0:
            data.senses.append(Sense(tags=["no-gloss"]))
    return [m.model_dump(exclude_defaults=True) for m in page_data]


def match_sense_index(sense_index: str, word_entry: WordEntry) -> bool:
    # return `True` if `word_entry` has a `Sense` with the same POS section
    # index number, usually the first number before "."
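    # `sense_index` may be a single index such as "1.1", a comma-separated
    # list such as "1.1, 2.1", or a range such as "1-2"; the code below
    # handles each of these shapes.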
    if hasattr(word_entry, "senses") and len(word_entry.senses) == 0:
        return False
    if hasattr(word_entry, "senses"):
        sense = word_entry.senses[0]
    elif isinstance(word_entry, Sense):
        sense = word_entry
    # find exact match for index like "1.1"
    exact_match = not (
        "," in sense_index or "-" in sense_index or "." not in sense_index
    )
    if exact_match:
        return sense_index == sense.sense_index

    pos_index_str = sense.sense_index[: sense_index.find(".")]
    pos_section_index = 0
    if pos_index_str.isdigit():
        pos_section_index = int(pos_index_str)
    else:
        return False
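
    # a comma-separated or range index matches if any of its parts shares the
    # POS section number: a dotted part must have the same prefix before the
    # ".", and a numeric range like "1-3" must cover the section number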
    for part_of_index in sense_index.split(","):
        part_of_index = part_of_index.strip()
        if (
            "." in part_of_index
            and pos_index_str == part_of_index[: part_of_index.find(".")]
        ):
            return True
        elif re.fullmatch(r"\d+-\d+", part_of_index):
            start_str, end_str = part_of_index.split("-")
            if int(start_str) <= pos_section_index <= int(end_str):
                return True

    return False