Coverage for src/wiktextract/extractor/de/example.py: 85%
80 statements
« prev ^ index » next coverage.py v7.10.6, created at 2025-09-17 08:19 +0000
« prev ^ index » next coverage.py v7.10.6, created at 2025-09-17 08:19 +0000
1import re
3from wikitextprocessor import LevelNode, NodeKind, WikiNode
5from ...page import clean_node
6from ...wxr_context import WiktextractContext
7from ..share import calculate_bold_offsets
8from .models import Example, Sense, WordEntry
9from .tags import translate_raw_tags
10from .utils import extract_sense_index
# Maps lower-cased parameter names of the German "Literatur" citation
# template (https://de.wiktionary.org/wiki/Vorlage:Literatur) to the
# attribute names that extract_reference() sets on the example object.
# Several template parameters are aliases and share one target attribute
# ("autor"/"a", "herausgeber"/"hrsg", "sammelwerk"/"werk").
LITERATUR_TEMPLATE_ARGS = {
    "autor": "author",
    "a": "author",
    "titel": "title",
    "titelerg": "title_complement",
    "auflage": "edition",
    "verlag": "publisher",
    "ort": "place",
    "jahr": "year",
    "seiten": "pages",
    "isbn": "isbn",
    "übersetzer": "translator",
    "herausgeber": "editor",
    "sammelwerk": "collection",
    "werk": "collection",
    "band": "volume",
    "kommentar": "comment",
    "online": "url",
    "tag": "day",
    "monat": "month",
    "zugriff": "accessdate",
    "nummer": "number",
    "datum": "date",
    "hrsg": "editor",
}
def extract_examples(
    wxr: WiktextractContext,
    word_entry: WordEntry,
    level_node: LevelNode,
) -> None:
    """Extract the examples of an examples section into ``word_entry``.

    Every list item below ``level_node`` is visited:

    * An item whose list marker does not end with ":" is a header; its
      cleaned text replaces the raw tags applied to the examples after it.
    * An item whose list marker ends with ":" is an example line. When
      ``extract_sense_index`` yields a sense index, the example is appended
      to every sense of ``word_entry`` matching that index, or to a newly
      created "no-gloss" sense when none matches. When there is no sense
      index, the text is stored as the translation of the most recent
      indexed example.

    Direct children of ``level_node`` that are not lists are only reported
    through the debug log.
    """
    last_example = None
    raw_tags = []
    for list_item_node in level_node.find_child_recursively(NodeKind.LIST_ITEM):
        if not list_item_node.sarg.endswith(":"):
            # Header item: start a fresh set of raw tags.
            raw_tags.clear()
            raw_tag = clean_node(wxr, None, list_item_node.children).strip(": ")
            if raw_tag:
                raw_tags.append(raw_tag)
            continue

        example_data = Example(raw_tags=raw_tags)
        for ref_tag in list_item_node.find_html("ref"):
            extract_reference(wxr, example_data, ref_tag)
        # Re-parse the item without its nested lists; nested items are
        # visited on their own by the recursive iteration above.
        example_text_node = wxr.wtp.parse(
            wxr.wtp.node_to_wikitext(
                list(
                    list_item_node.invert_find_child(
                        NodeKind.LIST, include_empty_str=True
                    )
                )
            )
        )
        example_text = clean_node(wxr, None, example_text_node)
        sense_idx, example_text = extract_sense_index(example_text)
        if not example_text:
            continue

        translate_raw_tags(example_data)
        example_data.text = example_text
        calculate_bold_offsets(
            wxr,
            example_text_node,
            example_text,
            example_data,
            "italic_text_offsets",
            extra_node_kind=NodeKind.ITALIC,
        )
        if sense_idx:
            found_sense = False
            # No break: one index such as "1, 2" may refer to several senses.
            for sense in word_entry.senses:
                if match_sense_index(sense_idx, sense):
                    sense.examples.append(example_data)
                    found_sense = True
            if not found_sense:
                new_sense = Sense(sense_index=sense_idx, tags=["no-gloss"])
                new_sense.examples.append(example_data)
                word_entry.senses.append(new_sense)
            last_example = example_data
        elif last_example is not None:
            # Line without a sense index: translation of the previous
            # example. The offsets must be stored on ``last_example``, the
            # object that receives the translation; ``example_data`` is
            # discarded in this branch.
            last_example.translation = example_text
            calculate_bold_offsets(
                wxr,
                example_text_node,
                example_text,
                last_example,
                "italic_translation_offsets",
                extra_node_kind=NodeKind.ITALIC,
            )
        else:
            wxr.wtp.debug(
                f"Found example data without senseid: {example_data}",
                sortid="extractor/de/examples/extract_examples/28",
            )
            # Already None in this branch; kept as an explicit reset.
            last_example = None

    for non_list_node in level_node.invert_find_child(NodeKind.LIST):
        wxr.wtp.debug(
            f"Found unexpected non-list node in examples: {non_list_node}",
            sortid="extractor/de/examples/extract_examples/33",
        )
def extract_reference(
    wxr: WiktextractContext, example_data: Example, ref_node: WikiNode
):
    """Copy source information from a reference node onto ``example_data``.

    The cleaned text of the whole node is stored in ``example_data.ref``.
    For each "Literatur" template among the node's children, every string
    parameter listed in ``LITERATUR_TEMPLATE_ARGS`` is cleaned and assigned
    to the mapped attribute, provided ``example_data`` has that attribute.
    String parameters missing from the table are reported through the
    debug log; non-string (positional) keys are ignored.
    """
    example_data.ref = clean_node(wxr, None, ref_node.children)
    for template_node in ref_node.find_child(NodeKind.TEMPLATE):
        if template_node.template_name != "Literatur":
            # XXX: Treat other templates as well.
            # E.g. https://de.wiktionary.org/wiki/Vorlage:Ref-OWID
            continue
        # https://de.wiktionary.org/wiki/Vorlage:Literatur
        for key, value in template_node.template_parameters.items():
            if not isinstance(key, str):
                continue
            field = LITERATUR_TEMPLATE_ARGS.get(key.lower())
            if field is None:
                wxr.wtp.debug(
                    f"Unexpected key in Literatur template: {key}",
                    sortid="extractor/de/examples/extract_examples/77",
                )
            elif hasattr(example_data, field):
                setattr(example_data, field, clean_node(wxr, None, value))
def match_sense_index(sense_idx: str, sense: Sense) -> bool:
    """Return whether the example index ``sense_idx`` refers to ``sense``.

    ``sense_idx`` is compared against ``sense.sense_index``:

    * An index that contains "." but neither "," nor "-" (e.g. "1.1") must
      equal the sense's index exactly.
    * Otherwise ``sense_idx`` is treated as a comma-separated list. An
      entry matches when it equals the sense's index, or when it is a
      numeric range such as "1-3" or "1–3" that contains the leading number
      of the sense's index (so "1-3" matches sense "2.1").

    A sense whose index does not start with a decimal number can only be
    matched by equality, never by a range.
    """
    own_index = sense.sense_index
    is_single_sub_index = (
        "." in sense_idx and "," not in sense_idx and "-" not in sense_idx
    )
    if is_single_sub_index:
        return sense_idx == own_index
    if sense_idx == own_index:
        return True

    # Leading number of the sense's own index, needed only for range
    # entries. ``isdecimal`` rather than ``isdigit``: the latter also
    # accepts characters such as "²" that ``int`` cannot parse.
    first_number_str = re.split(r",|\.|-|–", own_index, maxsplit=1)[0]
    first_number = (
        int(first_number_str) if first_number_str.isdecimal() else None
    )

    for try_idx in sense_idx.split(","):
        try_idx = try_idx.strip()
        # Equality is checked for every entry, including non-numeric ones
        # such as "1a" in "1a, 1b".
        if try_idx == own_index:
            return True
        if first_number is not None and re.fullmatch(r"\d+[\-–]\d+", try_idx):
            start_str, end_str = re.split(r"-|–", try_idx, maxsplit=1)
            if int(start_str) <= first_number <= int(end_str):
                return True

    return False