Coverage for src/wiktextract/extractor/nl/page.py: 82%
105 statements
« prev ^ index » next coverage.py v7.6.10, created at 2024-12-27 08:07 +0000
1import re
2from typing import Any
4from mediawiki_langcodes import name_to_code
5from wikitextprocessor.parser import (
6 LEVEL_KIND_FLAGS,
7 LevelNode,
8 NodeKind,
9)
11from ...page import clean_node
12from ...wxr_context import WiktextractContext
13from .descendant import extract_descendant_section
14from .etymology import extract_etymology_section
15from .inflection import FORMS_TABLE_TEMPLATES, extract_inflection_template
16from .linkage import extract_fixed_preposition_section, extract_linkage_section
17from .models import Etymology, Sense, WordEntry
18from .pos import extract_pos_section
19from .section_titles import LINKAGE_SECTIONS, POS_DATA
20from .sound import extract_hyphenation_section, extract_sound_section
21from .spelling_form import extract_spelling_form_section
22from .translation import extract_translation_section
def extract_section_categories(
    wxr: WiktextractContext, word_entry: WordEntry, level_node: LevelNode
) -> None:
    """Run ``clean_node`` over every direct link child of ``level_node``.

    The text returned by ``clean_node`` is discarded; the call is made only
    for what it records on ``word_entry`` (presumably category links, as the
    function name suggests -- confirm against ``clean_node``).
    """
    for link_node in level_node.find_child(NodeKind.LINK):
        clean_node(wxr, word_entry, link_node)
def select_word_entry(
    page_data: list[WordEntry], base_data: WordEntry
) -> WordEntry:
    """Pick the entry that data of the current section should be stored on.

    Returns the last item of ``page_data`` when the list is non-empty and
    that item has the same ``lang_code`` as ``base_data``; otherwise returns
    ``base_data`` itself.
    """
    # use a function not a variable because new data could be appended to
    # `page_data` after the variable is created
    if page_data and page_data[-1].lang_code == base_data.lang_code:
        return page_data[-1]
    return base_data
def parse_section(
    wxr: WiktextractContext,
    page_data: list[WordEntry],
    base_data: WordEntry,
    forms_data: WordEntry,
    level_node: LevelNode,
) -> list[Etymology]:
    """Dispatch one section heading to its extractor, then walk its children.

    The cleaned heading text (with a trailing counter such as `` 1``, `` #2``
    or `` 3:`` removed) selects which extractor is called. Afterwards the
    function recurses into every child section, collects categories from the
    node's direct links, and processes inflection-table templates found
    directly under the node.

    Args:
        wxr: Extraction context.
        page_data: Entries collected for the page so far; may be appended to
            by ``extract_pos_section``.
        base_data: Fallback entry of the current language section.
        forms_data: Entry that receives inflection forms when the section is
            not itself a "Vervoeging"/"Verbuiging" section.
        level_node: The section node to process.

    Returns:
        The result of ``extract_etymology_section`` when this node is a
        "Woordherkomst en -opbouw" section, otherwise an empty list. Return
        values of the recursive calls on child sections are not propagated.
    """
    # title templates
    # https://nl.wiktionary.org/wiki/Categorie:Lemmasjablonen
    title_text = clean_node(wxr, None, level_node.largs)
    title_text = re.sub(r"\s+#?\d+:?$", "", title_text)
    wxr.wtp.start_subsection(title_text)
    etymology_data: list[Etymology] = []
    # Inflection sections are known titles and also change where forms go.
    is_inflection_title = title_text.startswith(("Vervoeging", "Verbuiging"))

    if title_text in POS_DATA:
        last_data_len = len(page_data)
        extract_pos_section(
            wxr, page_data, base_data, forms_data, level_node, title_text
        )
        # A title can be both a POS and a linkage title; when the POS
        # extractor added no entry, treat the section as linkage data.
        if len(page_data) == last_data_len and title_text in LINKAGE_SECTIONS:
            extract_linkage_section(
                wxr,
                page_data[-1] if len(page_data) > 0 else base_data,
                level_node,
                LINKAGE_SECTIONS[title_text],
            )
    elif title_text == "Uitspraak":
        extract_sound_section(
            wxr, select_word_entry(page_data, base_data), level_node
        )
    elif title_text in LINKAGE_SECTIONS:
        extract_linkage_section(
            wxr,
            select_word_entry(page_data, base_data),
            level_node,
            LINKAGE_SECTIONS[title_text],
        )
    elif title_text == "Vertalingen":
        extract_translation_section(
            wxr, select_word_entry(page_data, base_data), level_node
        )
    elif title_text == "Woordafbreking":
        extract_hyphenation_section(
            wxr, select_word_entry(page_data, base_data), level_node
        )
    elif title_text == "Woordherkomst en -opbouw":
        etymology_data = extract_etymology_section(wxr, level_node)
    elif title_text in ["Schrijfwijzen", "Verdere woordvormen"]:
        extract_spelling_form_section(
            wxr, select_word_entry(page_data, base_data), level_node
        )
    elif title_text == "Opmerkingen":
        extract_note_section(
            wxr, select_word_entry(page_data, base_data), level_node
        )
    elif title_text == "Overerving en ontlening":
        extract_descendant_section(
            wxr, select_word_entry(page_data, base_data), level_node
        )
    elif title_text == "Vaste voorzetsels":
        extract_fixed_preposition_section(
            wxr, select_word_entry(page_data, base_data), level_node
        )
    elif title_text in [
        "Gangbaarheid",
        "Meer informatie",
        "Verwijzingen",
        "Citaten",
    ]:
        pass  # ignore
    elif not is_inflection_title:
        wxr.wtp.debug(f"unknown title: {title_text}", sortid="nl/page/60")

    for next_level in level_node.find_child(LEVEL_KIND_FLAGS):
        parse_section(wxr, page_data, base_data, forms_data, next_level)
    # Selected after the recursion so entries added by child sections count.
    extract_section_categories(
        wxr, select_word_entry(page_data, base_data), level_node
    )
    is_first_forms_template = True
    for t_node in level_node.find_child(NodeKind.TEMPLATE):
        if t_node.template_name in FORMS_TABLE_TEMPLATES:
            if is_first_forms_template:
                is_first_forms_template = False
                # Discard forms gathered earlier before the first forms
                # table of this section is extracted.
                if len(forms_data.forms) > 0:
                    forms_data.forms.clear()
                    forms_data.extracted_vervoeging_page = False
            extract_inflection_template(
                wxr,
                page_data[-1]
                if is_inflection_title
                and len(page_data) > 0
                and page_data[-1].lang_code == base_data.lang_code
                else forms_data,
                t_node,
            )
    return etymology_data
def parse_page(
    wxr: WiktextractContext, page_title: str, page_text: str
) -> list[dict[str, Any]]:
    """Parse one page and return its word entries as plain dicts.

    Pages whose title ends with ``/vervoeging`` are skipped and yield an
    empty list. Otherwise every level-2 node is treated as a language
    section: its heading is mapped to a language code (``"unknown"`` when the
    lookup returns an empty string), sections whose code is not in
    ``wxr.config.capture_language_codes`` are skipped when that setting is
    not ``None``, and the remaining sections are processed through
    ``parse_section``.

    Args:
        wxr: Extraction context.
        page_title: Title of the page.
        page_text: Wikitext of the page.

    Returns:
        One ``model_dump(exclude_defaults=True)`` dict per collected entry.
    """
    # page layout
    # https://nl.wiktionary.org/wiki/WikiWoordenboek:Stramien
    # language templates
    # https://nl.wiktionary.org/wiki/Categorie:Hoofdtaalsjablonen
    if page_title.endswith("/vervoeging"):
        return []  # skip conjugation pages
    wxr.wtp.start_page(page_title)
    tree = wxr.wtp.parse(page_text, pre_expand=True)
    page_data: list[WordEntry] = []
    for level2_node in tree.find_child(NodeKind.LEVEL2):
        lang_name = clean_node(wxr, None, level2_node.largs)
        lang_code = name_to_code(lang_name, "nl")
        if lang_code == "":
            lang_code = "unknown"
        if (
            wxr.config.capture_language_codes is not None
            and lang_code not in wxr.config.capture_language_codes
        ):
            continue
        wxr.wtp.start_section(lang_name)
        base_data = WordEntry(
            word=wxr.wtp.title,
            lang_code=lang_code,
            lang=lang_name,
            pos="unknown",
        )
        # Separate copy so collected forms do not end up on base_data.
        forms_data = base_data.model_copy(deep=True)
        extract_section_categories(wxr, base_data, level2_node)
        etymology_data = []
        for t_node in level2_node.find_child(NodeKind.TEMPLATE):
            extract_inflection_template(wxr, forms_data, t_node)
        for next_level_node in level2_node.find_child(LEVEL_KIND_FLAGS):
            new_e_data = parse_section(
                wxr, page_data, base_data, forms_data, next_level_node
            )
            # Keep the most recent non-empty etymology result.
            if len(new_e_data) > 0:
                etymology_data = new_e_data
        # Attach etymology to entries of this language; an etymology with an
        # empty index applies to every entry, otherwise indexes must match.
        for data in page_data:
            if data.lang_code == lang_code:
                for e_data in etymology_data:
                    if (
                        e_data.index == data.etymology_index
                        or e_data.index == ""
                    ):
                        data.etymology_texts.append(e_data.text)
                        data.categories.extend(e_data.categories)

    # Entries without any sense get a placeholder sense tagged "no-gloss".
    for data in page_data:
        if len(data.senses) == 0:
            data.senses.append(Sense(tags=["no-gloss"]))
    return [m.model_dump(exclude_defaults=True) for m in page_data]
def extract_note_section(
    wxr: WiktextractContext, word_entry: WordEntry, level_node: LevelNode
) -> None:
    """Append the text of each list item under ``level_node`` to the notes.

    List items are searched recursively. The children of each item are
    cleaned with ``clean_node`` and the result is appended to
    ``word_entry.notes`` unless it is an empty string.
    """
    for list_item in level_node.find_child_recursively(NodeKind.LIST_ITEM):
        note_str = clean_node(wxr, word_entry, list_item.children)
        if len(note_str) > 0:
            word_entry.notes.append(note_str)