Coverage for src/wiktextract/extractor/nl/page.py: 77%
92 statements
« prev ^ index » next coverage.py v7.6.4, created at 2024-10-25 10:11 +0000
« prev ^ index » next coverage.py v7.6.4, created at 2024-10-25 10:11 +0000
1from typing import Any
3from mediawiki_langcodes import name_to_code
4from wikitextprocessor.parser import (
5 LEVEL_KIND_FLAGS,
6 LevelNode,
7 NodeKind,
8 WikiNode,
9)
11from ...page import clean_node
12from ...wxr_context import WiktextractContext
13from .descendant import extract_descendant_section
14from .etymology import extract_etymology_section
15from .inflection import extract_inflection_template
16from .linkage import extract_fixed_preposition_section, extract_linkage_section
17from .models import Etymology, Sense, WordEntry
18from .pos import extract_pos_section
19from .section_titles import LINKAGE_SECTIONS, POS_DATA
20from .sound import extract_hyphenation_section, extract_sound_section
21from .spelling_form import extract_spelling_form_section
22from .translation import extract_translation_section
def extract_section_categories(
    wxr: WiktextractContext, word_entry: WordEntry, level_node: LevelNode
) -> None:
    """Run ``clean_node`` over each LINK node yielded by ``find_child``.

    The cleaned text is discarded; the call is made only for whatever
    ``clean_node`` records on ``word_entry`` while processing the link.
    NOTE(review): presumably this is how category links end up on the
    entry -- confirm against ``clean_node``.
    """
    link_nodes = level_node.find_child(NodeKind.LINK)
    for link in link_nodes:
        clean_node(wxr, word_entry, link)
def parse_section(
    wxr: WiktextractContext,
    page_data: list[WordEntry],
    base_data: WordEntry,
    forms_data: WordEntry,
    level_node: WikiNode,
) -> list[Etymology]:
    """Route one section heading to its extractor, then handle its children.

    The heading text selects the extractor.  Sections other than
    part-of-speech and etymology sections write into the most recent entry
    of ``page_data``, or into ``base_data`` while ``page_data`` is empty.
    After dispatch, nested sections are parsed recursively, the section's
    link nodes are run through ``extract_section_categories`` and its
    template nodes through ``extract_inflection_template`` (which receives
    ``forms_data``).

    Returns the result of ``extract_etymology_section`` when this node is
    the etymology section, otherwise an empty list.  Values returned by the
    recursive calls on nested sections are not propagated.
    """
    # title templates
    # https://nl.wiktionary.org/wiki/Categorie:Lemmasjablonen
    heading = clean_node(wxr, None, level_node.largs)
    wxr.wtp.start_subsection(heading)
    etymologies = []

    # Entry the non-POS branches below write into.
    target = page_data[-1] if page_data else base_data

    # Extractors that all share the (wxr, entry, level_node) call shape.
    # Built per call because extract_note_section is defined later in
    # this module.
    simple_extractors = {
        "Vertalingen": extract_translation_section,
        "Woordafbreking": extract_hyphenation_section,
        "Schrijfwijzen": extract_spelling_form_section,
        "Verdere woordvormen": extract_spelling_form_section,
        "Opmerkingen": extract_note_section,
        "Overerving en ontlening": extract_descendant_section,
        "Vaste voorzetsels": extract_fixed_preposition_section,
    }
    # Known headings that are deliberately not extracted here.
    skipped_headings = (
        "Vervoeging",  # conjugation
        "Verbuiging",  # inflection
        "Gangbaarheid",
        "Meer informatie",
        "Verwijzingen",
        "Citaten",
    )

    if heading in POS_DATA:
        extract_pos_section(
            wxr, page_data, base_data, forms_data, level_node, heading
        )
    elif heading == "Uitspraak":
        extract_sound_section(wxr, target, level_node)
    elif heading in LINKAGE_SECTIONS:
        extract_linkage_section(
            wxr, target, level_node, LINKAGE_SECTIONS[heading]
        )
    elif heading == "Woordherkomst en -opbouw":
        etymologies = extract_etymology_section(wxr, level_node)
    elif heading in simple_extractors:
        simple_extractors[heading](wxr, target, level_node)
    elif heading not in skipped_headings:
        wxr.wtp.debug(f"unknown title: {heading}", sortid="nl/page/60")

    for child_section in level_node.find_child(LEVEL_KIND_FLAGS):
        parse_section(wxr, page_data, base_data, forms_data, child_section)

    # Re-evaluate the target: page_data is handed to the POS extractor and
    # to the recursive calls above, so its last entry may have changed.
    category_target = page_data[-1] if page_data else base_data
    extract_section_categories(wxr, category_target, level_node)

    for template_node in level_node.find_child(NodeKind.TEMPLATE):
        extract_inflection_template(wxr, forms_data, template_node)
    return etymologies
def parse_page(
    wxr: WiktextractContext, page_title: str, page_text: str
) -> list[dict[str, Any]]:
    """Parse one page and return its word entries as plain dicts.

    Pages whose title ends with ``/vervoeging`` yield an empty list.  For
    every level-2 section the heading is mapped to a language code
    (``"unknown"`` when ``name_to_code`` returns an empty string); sections
    whose code is not in ``wxr.config.capture_language_codes`` are skipped
    when that setting is not ``None``.  Etymology texts and categories
    returned by ``parse_section`` are copied onto the entries with the
    same language code, and entries left without senses receive a single
    ``no-gloss`` sense before being dumped.
    """
    # page layout
    # https://nl.wiktionary.org/wiki/WikiWoordenboek:Stramien
    # language templates
    # https://nl.wiktionary.org/wiki/Categorie:Hoofdtaalsjablonen
    if page_title.endswith("/vervoeging"):
        return []  # skip conjugation pages

    wxr.wtp.start_page(page_title)
    root = wxr.wtp.parse(page_text, pre_expand=True)
    page_data: list[WordEntry] = []

    for lang_section in root.find_child(NodeKind.LEVEL2):
        lang_name = clean_node(wxr, None, lang_section.largs)
        lang_code = name_to_code(lang_name, "nl")
        if lang_code == "":
            lang_code = "unknown"

        wanted_codes = wxr.config.capture_language_codes
        if wanted_codes is not None and lang_code not in wanted_codes:
            continue

        wxr.wtp.start_section(lang_name)
        base_data = WordEntry(
            word=wxr.wtp.title,
            lang_code=lang_code,
            lang=lang_name,
            pos="unknown",
        )
        forms_data = base_data.model_copy(deep=True)
        extract_section_categories(wxr, base_data, lang_section)

        # Only the last non-empty etymology result of this language
        # section is kept.
        etymologies = []
        for subsection in lang_section.find_child(LEVEL_KIND_FLAGS):
            found = parse_section(
                wxr, page_data, base_data, forms_data, subsection
            )
            if found:
                etymologies = found

        for entry in page_data:
            if entry.lang_code != lang_code:
                continue
            for etymology in etymologies:
                # An etymology with an empty index applies to every entry.
                applies = (
                    etymology.index == entry.etymology_index
                    or etymology.index == ""
                )
                if not applies:
                    continue
                entry.etymology_texts.append(etymology.text)
                entry.categories.extend(etymology.categories)

    for entry in page_data:
        if not entry.senses:
            entry.senses.append(Sense(tags=["no-gloss"]))
    return [entry.model_dump(exclude_defaults=True) for entry in page_data]
def extract_note_section(
    wxr: WiktextractContext, word_entry: WordEntry, level_node: LevelNode
) -> None:
    """Append the cleaned text of each list item to ``word_entry.notes``.

    List items are collected with ``find_child_recursively``; items whose
    cleaned text is empty are not added.
    """
    for item in level_node.find_child_recursively(NodeKind.LIST_ITEM):
        note = clean_node(wxr, word_entry, item.children)
        if note:
            word_entry.notes.append(note)