Coverage for src/wiktextract/extractor/ms/example.py: 99%
53 statements
« prev ^ index » next coverage.py v7.10.7, created at 2025-10-13 10:14 +0000
« prev ^ index » next coverage.py v7.10.7, created at 2025-10-13 10:14 +0000
1import re
3from wikitextprocessor import NodeKind, TemplateNode, WikiNode
5from ...page import clean_node
6from ...wxr_context import WiktextractContext
7from ..share import calculate_bold_offsets
8from .linkage import LINKAGE_TEMPLATES, extract_nyms_template
9from .models import Example, Sense, WordEntry
def extract_example_list_item(
    wxr: WiktextractContext,
    word_entry: WordEntry,
    sense: Sense,
    list_item: WikiNode,
    parent_e_data: Example | None = None,
) -> None:
    """Fill an ``Example`` from the children of ``list_item``.

    When ``parent_e_data`` is ``None`` a fresh ``Example`` is created and,
    if it ends up with non-empty text, appended to ``sense.examples``.
    When ``parent_e_data`` is given (the recursive call made for nested
    lists), that object is updated in place and nothing is appended here.

    Child handling:

    * example/quotation templates are delegated to ``extract_cp_template``;
    * templates named in ``LINKAGE_TEMPLATES`` are delegated to
      ``extract_nyms_template``;
    * an italic node supplies the example text at the top level, or the
      translation inside a nested list item;
    * a nested list is processed recursively against the same example;
    * a bold node in first position whose cleaned text is exactly four
      digits (presumably a year) marks the item as a citation: every
      non-list child becomes the reference and later italic nodes are
      ignored.
    """
    is_nested = parent_e_data is not None
    example = parent_e_data if is_nested else Example(text="")
    example_template_names = ("cp", "usex", "ux", "ko-usex", "uxi", "quote")
    skip_italic = False

    for position, child in enumerate(list_item.children):
        if isinstance(child, TemplateNode):
            template_name = child.template_name
            if (
                template_name in example_template_names
                or template_name.startswith("quote-")
            ):
                extract_cp_template(wxr, sense, child, example)
                continue
            if template_name in LINKAGE_TEMPLATES:
                extract_nyms_template(wxr, word_entry, child)
                continue
            # Any other template falls through to the generic node checks.
        if not isinstance(child, WikiNode):
            continue

        kind = child.kind
        if kind == NodeKind.ITALIC and not skip_italic:
            if is_nested:
                example.translation = clean_node(wxr, sense, child)
                calculate_bold_offsets(
                    wxr,
                    child,
                    example.translation,
                    example,
                    "bold_translation_offsets",
                )
            else:
                example.text = clean_node(wxr, sense, child)
                calculate_bold_offsets(
                    wxr, child, example.text, example, "bold_text_offsets"
                )
        elif kind == NodeKind.LIST:
            for nested_item in child.find_child(NodeKind.LIST_ITEM):
                extract_example_list_item(
                    wxr, word_entry, sense, nested_item, example
                )
        elif position == 0 and kind == NodeKind.BOLD:
            if re.fullmatch(r"\d{4}", clean_node(wxr, None, child)):
                # The reference is built from everything except nested lists.
                non_list_nodes = list(
                    list_item.invert_find_child(
                        NodeKind.LIST, include_empty_str=True
                    )
                )
                example.ref = clean_node(wxr, sense, non_list_nodes)
                skip_italic = True

    if not is_nested and example.text != "":
        sense.examples.append(example)
def extract_cp_template(
    wxr: WiktextractContext, sense: Sense, t_node: TemplateNode, e_data: Example
) -> None:
    """Populate ``e_data`` from the expanded HTML of an example template.

    The template is converted back to wikitext and re-parsed with full
    expansion. Each HTML node found in the result is matched against a
    fixed list of class markers; the first marker contained in the node's
    ``class`` attribute decides which ``e_data`` field receives the node's
    cleaned text. Every field except ``ref`` also gets its bold offsets
    computed. The expanded tree is finally cleaned against ``sense`` with
    the result discarded.
    """
    # (class marker, Example attribute, bold-offsets field or None).
    # Order matters: the first marker found in the class string wins.
    marker_table = (
        ("e-example", "text", "bold_text_offsets"),
        ("e-quotation", "text", "bold_text_offsets"),
        ("e-transliteration", "roman", "bold_roman_offsets"),
        ("e-translation", "translation", "bold_translation_offsets"),
        ("e-literally", "literal_meaning", "bold_literal_offsets"),
        ("cited-source", "ref", None),
    )

    wikitext = wxr.wtp.node_to_wikitext(t_node)
    expanded = wxr.wtp.parse(wikitext, expand_all=True)

    for tag in expanded.find_child_recursively(NodeKind.HTML):
        class_attr = tag.attrs.get("class", "")
        for marker, attr_name, offsets_field in marker_table:
            if marker not in class_attr:
                continue
            setattr(e_data, attr_name, clean_node(wxr, None, tag))
            if offsets_field is not None:
                calculate_bold_offsets(
                    wxr,
                    tag,
                    getattr(e_data, attr_name),
                    e_data,
                    offsets_field,
                )
            break

    clean_node(wxr, sense, expanded)