Coverage for src / wiktextract / extractor / de / example.py: 86%
68 statements
« prev ^ index » next coverage.py v7.13.0, created at 2025-12-12 08:09 +0000
« prev ^ index » next coverage.py v7.13.0, created at 2025-12-12 08:09 +0000
1import re
3from wikitextprocessor import LevelNode, NodeKind, WikiNode
5from ...page import clean_node
6from ...wxr_context import WiktextractContext
7from ..share import calculate_bold_offsets
8from .models import Example, Sense, WordEntry
9from .tags import translate_raw_tags
10from .utils import extract_sense_index
def extract_examples(
    wxr: WiktextractContext,
    word_entry: WordEntry,
    level_node: LevelNode,
) -> None:
    """Collect examples from an example section and attach them to senses.

    Every list item found below ``level_node`` is visited:

    * An item whose list marker does not end with ``:`` is treated as a
      raw-tag header.  It replaces the current raw tags, which are passed to
      the examples created afterwards.
    * An item whose marker ends with ``:`` is an example line.  If it carries
      a sense index it is appended to every sense of ``word_entry`` accepted
      by ``match_sense_index``; when none matches, a new sense tagged
      ``no-gloss`` is created to hold it.  A non-empty line without a sense
      index that follows such an example is stored as that example's
      translation.

    Child nodes of ``level_node`` that are not lists are reported through
    ``wxr.wtp.debug``.

    Args:
        wxr: Extraction context; used for parsing, cleaning and debug output.
        word_entry: Entry whose ``senses`` receive the examples (mutated in
            place).
        level_node: Section node containing the example lists.
    """
    last_example = None
    raw_tags = []
    for list_item_node in level_node.find_child_recursively(NodeKind.LIST_ITEM):
        if not list_item_node.sarg.endswith(":"):
            # Header line: it replaces whatever raw tags were active before.
            raw_tags.clear()
            raw_tag = clean_node(wxr, None, list_item_node.children)
            raw_tag = raw_tag.strip(": ")
            if raw_tag != "":
                raw_tags.append(raw_tag)
            continue

        example_data = Example(text="", raw_tags=raw_tags)
        for ref_tag in list_item_node.find_html("ref"):
            example_data.ref = clean_node(wxr, None, ref_tag.children)
        # Re-parse the item without its nested lists so that only the text of
        # this line (not of its sub-items) is cleaned.
        example_text_node = wxr.wtp.parse(
            wxr.wtp.node_to_wikitext(
                list(
                    list_item_node.invert_find_child(
                        NodeKind.LIST, include_empty_str=True
                    )
                )
            )
        )
        example_text = clean_node(wxr, None, example_text_node)
        sense_idx, example_text = extract_sense_index(example_text)
        if len(example_text) == 0:
            continue

        translate_raw_tags(example_data)
        example_data.text = example_text
        calculate_bold_offsets(
            wxr,
            example_text_node,
            example_text,
            example_data,
            "bold_text_offsets",
            extra_node_kind=NodeKind.ITALIC,
        )
        if len(sense_idx) > 0:
            find_sense = False
            for sense in word_entry.senses:
                if match_sense_index(sense_idx, sense):
                    sense.examples.append(example_data)
                    find_sense = True
            if not find_sense:
                # No existing sense carries this index: keep the example on a
                # placeholder sense instead of dropping it.
                new_sense = Sense(sense_index=sense_idx, tags=["no-gloss"])
                new_sense.examples.append(example_data)
                word_entry.senses.append(new_sense)
            last_example = example_data
        elif last_example is not None:
            # Line without a sense index right after an example: it is the
            # translation of that example.  ``example_data`` is never attached
            # to a sense in this branch, so the bold offsets have to be
            # recorded on ``last_example`` next to the translation text.
            # NOTE(review): assumes calculate_bold_offsets stores the offsets
            # in the named field of the object it is given - confirm.
            last_example.translation = example_text
            calculate_bold_offsets(
                wxr,
                example_text_node,
                example_text,
                last_example,
                "bold_translation_offsets",
                extra_node_kind=NodeKind.ITALIC,
            )
        else:
            wxr.wtp.debug(
                f"Found example data without senseid: {example_data}",
                sortid="extractor/de/examples/extract_examples/28",
            )
            last_example = None

    for non_list_node in level_node.invert_find_child(NodeKind.LIST):
        wxr.wtp.debug(
            f"Found unexpected non-list node in examples: {non_list_node}",
            sortid="extractor/de/examples/extract_examples/33",
        )
def extract_reference(
    wxr: WiktextractContext, example_data: Example, ref_node: WikiNode
):
    """Set ``example_data.ref`` to the cleaned text of ``ref_node``'s children.

    Args:
        wxr: Extraction context handed to ``clean_node``.
        example_data: Example whose ``ref`` attribute is overwritten.
        ref_node: Node whose children hold the reference content.
    """
    reference_text = clean_node(wxr, None, ref_node.children)
    example_data.ref = reference_text
def match_sense_index(sense_idx: str, sense: Sense) -> bool:
    """Tell whether an example's sense index refers to ``sense``.

    ``sense_idx`` may be a single index (``"1"``, ``"1.2"``), a
    comma-separated list (``"1, 3"``) or a numeric range written with a
    hyphen or an en dash (``"1-3"``).

    * An index that contains a dot but neither a comma nor a hyphen must be
      equal to ``sense.sense_index``.
    * Otherwise the index matches if it equals ``sense.sense_index`` as a
      whole, if one of its comma-separated parts equals it, or if one of the
      parts is a ``start-end`` range containing the leading number of
      ``sense.sense_index``.

    Args:
        sense_idx: Sense index text taken from the example line.
        sense: Candidate sense; only its ``sense_index`` attribute is read.

    Returns:
        ``True`` if the example belongs to ``sense``, ``False`` otherwise.
    """
    target = sense.sense_index

    # A dotted index that is neither a list nor a hyphen range is compared
    # literally and never falls through to the looser matching below.
    is_list_or_range = "," in sense_idx or "-" in sense_idx
    if "." in sense_idx and not is_list_or_range:
        return sense_idx == target

    if sense_idx == target:
        return True

    # Ranges are checked against the leading number of the sense's own index;
    # without one nothing further can match.
    leading = re.split(r",|\.|-|–", target, maxsplit=1)[0]
    if not leading.isdigit():
        return False
    major = int(leading)

    for part in map(str.strip, sense_idx.split(",")):
        if part == target:
            return True
        if re.fullmatch(r"\d+[\-–]\d+", part) is None:
            continue
        low, high = re.split(r"-|–", part, maxsplit=1)
        if int(low) <= major <= int(high):
            return True

    return False