Coverage for src/wiktextract/extractor/de/example.py: 84%
77 statements
« prev ^ index » next coverage.py v7.6.10, created at 2024-12-27 08:07 +0000
1import re
3from wikitextprocessor import LevelNode, NodeKind, WikiNode
5from ...page import clean_node
6from ...wxr_context import WiktextractContext
7from .models import Example, Sense, WordEntry
8from .tags import translate_raw_tags
9from .utils import extract_sense_index
# Maps (lower-cased) parameter names of the German Wiktionary "Literatur"
# template to the attribute names that `extract_reference` sets on an
# example.  Several template parameters are aliases of one attribute.
LITERATUR_TEMPLATE_ARGS: dict[str, str] = {
    # author ("a" is an alias of "autor")
    "autor": "author",
    "a": "author",
    # title and its complement
    "titel": "title",
    "titelerg": "title_complement",
    # publication details
    "auflage": "edition",
    "verlag": "publisher",
    "ort": "place",
    "jahr": "year",
    "seiten": "pages",
    "isbn": "isbn",
    # contributors
    "übersetzer": "translator",
    "herausgeber": "editor",
    # containing work ("werk" is an alias of "sammelwerk")
    "sammelwerk": "collection",
    "werk": "collection",
    "band": "volume",
    "kommentar": "comment",
    "online": "url",
    # dates
    "tag": "day",
    "monat": "month",
    "zugriff": "accessdate",
    "nummer": "number",
    "datum": "date",
    # "hrsg" is an alias of "herausgeber"
    "hrsg": "editor",
}
def extract_examples(
    wxr: WiktextractContext,
    page_data: list[WordEntry],
    level_node: LevelNode,
) -> None:
    """Extract example sentences from an examples section.

    Every list item below ``level_node`` is visited:

    * An item whose list marker (``sarg``) does not end with ``":"`` is
      treated as a tag line: its cleaned text replaces the current raw
      tags, which are then passed to the examples that follow.
    * Any other item is an example.  Its ``<ref>`` children are handed to
      ``extract_reference`` and its text (without nested lists) is split
      into a sense index and the example text.

      - With a sense index, the example is appended to every sense in
        ``page_data`` accepted by ``match_sense_index``.  If none matches,
        a new ``no-gloss`` sense is added to the last entry of
        ``page_data``.
      - Without a sense index, the text is stored as the translation of
        the most recent indexed example; if there is none, a debug
        message is emitted.

    Finally a debug message is emitted for each direct child of
    ``level_node`` that is not a list.

    Args:
        wxr: Extraction context, used for cleaning nodes and debug output.
        page_data: Word entries of the page; their senses are modified in
            place.
        level_node: The section node containing the examples.
    """
    last_example = None
    raw_tags = []
    for list_item_node in level_node.find_child_recursively(NodeKind.LIST_ITEM):
        if not list_item_node.sarg.endswith(":"):
            # Tag line: it replaces whatever tags were active before.
            raw_tags.clear()
            raw_tag = clean_node(wxr, None, list_item_node.children)
            raw_tag = raw_tag.strip(": ")
            if raw_tag != "":
                raw_tags.append(raw_tag)
            continue

        example_data = Example(raw_tags=raw_tags)
        for ref_tag in list_item_node.find_html("ref"):
            extract_reference(wxr, example_data, ref_tag)
        example_text = clean_node(
            wxr, None, list(list_item_node.invert_find_child(NodeKind.LIST))
        )
        sense_idx, example_text = extract_sense_index(example_text)
        if len(example_text) == 0:
            continue

        translate_raw_tags(example_data)
        example_data.text = example_text
        if len(sense_idx) > 0:
            found_sense = False
            for word_entry in page_data:
                for sense in word_entry.senses:
                    if match_sense_index(sense_idx, sense):
                        sense.examples.append(example_data)
                        found_sense = True
            # Attach the fallback sense to the last entry explicitly rather
            # than through the leaked loop variable, which is unbound when
            # `page_data` is empty.
            if not found_sense and page_data:
                new_sense = Sense(sense_index=sense_idx, tags=["no-gloss"])
                new_sense.examples.append(example_data)
                page_data[-1].senses.append(new_sense)
            last_example = example_data
        elif last_example is not None:
            # A line without sense index directly after an example is
            # stored as that example's translation.
            last_example.translation = example_text
        else:
            wxr.wtp.debug(
                f"Found example data without senseid: {example_data}",
                sortid="extractor/de/examples/extract_examples/28",
            )

    for non_list_node in level_node.invert_find_child(NodeKind.LIST):
        wxr.wtp.debug(
            f"Found unexpected non-list node in examples: {non_list_node}",
            sortid="extractor/de/examples/extract_examples/33",
        )
def extract_reference(
    wxr: WiktextractContext, example_data: Example, ref_node: WikiNode
) -> None:
    """Fill the reference fields of ``example_data`` from a ``<ref>`` node.

    The cleaned text of the whole node is stored in ``example_data.ref``.
    For every ``Literatur`` template directly inside the node, each named
    parameter listed in ``LITERATUR_TEMPLATE_ARGS`` (compared
    case-insensitively) is cleaned and assigned to the mapped attribute,
    provided the model declares a field of that name.  Unknown named
    parameters are reported through ``wxr.wtp.debug``; parameters whose
    key is not a string are skipped.

    Args:
        wxr: Extraction context, used for cleaning nodes and debug output.
        example_data: The example to update in place.
        ref_node: The ``<ref>`` HTML node holding the reference.
    """
    example_data.ref = clean_node(wxr, None, ref_node.children)
    for template_node in ref_node.find_child(NodeKind.TEMPLATE):
        if template_node.template_name != "Literatur":
            # XXX: Treat other templates as well.
            # E.g. https://de.wiktionary.org/wiki/Vorlage:Ref-OWID
            continue

        # https://de.wiktionary.org/wiki/Vorlage:Literatur
        for key, value in template_node.template_parameters.items():
            if not isinstance(key, str):
                continue
            field = LITERATUR_TEMPLATE_ARGS.get(key.lower())
            if field is None:
                wxr.wtp.debug(
                    f"Unexpected key in Literatur template: {key}",
                    sortid="extractor/de/examples/extract_examples/77",
                )
            # Read `model_fields` from the class: accessing it through an
            # instance is deprecated in recent Pydantic releases.
            elif field in type(example_data).model_fields:
                setattr(example_data, field, clean_node(wxr, None, value))
def match_sense_index(sense_idx: str, sense: Sense) -> bool:
    """Return whether ``sense_idx`` refers to ``sense``.

    ``sense_idx`` is the index text found in front of an example, e.g.
    ``"1"``, ``"1.2"``, ``"1, 3"`` or ``"1-3"``.

    * An index that contains a dot but neither a comma nor a hyphen
      (e.g. ``"1.2"``) must equal ``sense.sense_index`` exactly.
    * Otherwise the index is split on commas.  A part matches if it equals
      ``sense.sense_index``, or if it is a numeric range ``"a-b"`` and the
      leading number of ``sense.sense_index`` lies within ``a..b``
      (inclusive), so ``"1-3"`` matches a sense indexed ``"2.1"``.

    Args:
        sense_idx: Sense index text of the example.
        sense: Candidate sense; only its ``sense_index`` is read.

    Returns:
        ``True`` if the example belongs to ``sense``, else ``False``.
    """
    own_index = sense.sense_index
    exact_match = (
        "." in sense_idx and "," not in sense_idx and "-" not in sense_idx
    )
    if exact_match:
        return sense_idx == own_index

    if sense_idx == own_index:
        return True

    # Leading number of the sense's own index, used for range checks.
    # `maxsplit` is passed by keyword: the positional form is deprecated
    # since Python 3.13.
    first_number_str = re.split(r",|\.|-", own_index, maxsplit=1)[0]
    # `isdecimal()` accepts exactly what `int()` can parse; `isdigit()`
    # also accepts characters such as "²" on which `int()` raises.
    if not first_number_str.isdecimal():
        return False
    first_number = int(first_number_str)

    for try_idx in sense_idx.split(","):
        try_idx = try_idx.strip()
        if try_idx == own_index:
            return True
        if re.fullmatch(r"\d+-\d+", try_idx):
            start_str, end_str = try_idx.split("-")
            if int(start_str) <= first_number <= int(end_str):
                return True

    return False