Coverage for src/wiktextract/extractor/ms/example.py: 99%
53 statements
« prev ^ index » next coverage.py v7.9.2, created at 2025-07-04 10:58 +0000
1import re
3from wikitextprocessor import NodeKind, TemplateNode, WikiNode
5from ...page import clean_node
6from ...wxr_context import WiktextractContext
7from ..share import calculate_bold_offsets
8from .linkage import LINKAGE_TEMPLATES, extract_nyms_template
9from .models import Example, Sense, WordEntry
def extract_example_list_item(
    wxr: WiktextractContext,
    word_entry: WordEntry,
    sense: Sense,
    list_item: WikiNode,
    parent_e_data: Example | None = None,
) -> None:
    """Collect example data from one list item and attach it to ``sense``.

    Walks the direct children of ``list_item``. Example/quotation templates
    are delegated to ``extract_cp_template``, linkage templates to
    ``extract_nyms_template``, italic nodes supply the example text (or the
    translation when this call handles a nested item), and nested lists are
    processed recursively with the same ``Example`` object.

    Args:
        wxr: Extraction context passed through to the helper functions.
        word_entry: Entry handed to ``extract_nyms_template`` when a linkage
            template is found.
        sense: Sense whose ``examples`` list receives the finished example;
            also passed to ``clean_node``.
        list_item: List item node whose children are scanned.
        parent_e_data: ``Example`` of the enclosing list item when this call
            handles a nested item; ``None`` for a top-level item.
    """
    # Nested items keep filling the parent's Example instead of starting a
    # new one, so every nesting level contributes to a single record.
    e_data = Example(text="") if parent_e_data is None else parent_e_data
    is_first_node = True
    # Set once a leading bold four-digit text has been turned into a
    # reference; italic nodes seen afterwards are then ignored.
    is_ref = False
    for node in list_item.children:
        if isinstance(node, TemplateNode) and (
            node.template_name
            in ["cp", "usex", "ux", "ko-usex", "uxi", "quote"]
            or node.template_name.startswith("quote-")
        ):
            extract_cp_template(wxr, sense, node, e_data)
        elif (
            isinstance(node, TemplateNode)
            and node.template_name in LINKAGE_TEMPLATES
        ):
            extract_nyms_template(wxr, word_entry, node)
        elif isinstance(node, WikiNode):
            if node.kind == NodeKind.ITALIC and not is_ref:
                if parent_e_data is None:
                    # Top-level item: the italic text is the example itself.
                    e_data.text = clean_node(wxr, sense, node)
                    calculate_bold_offsets(
                        wxr, node, e_data.text, e_data, "bold_text_offsets"
                    )
                else:
                    # Nested item: the italic text is stored as the
                    # translation of the parent's example.
                    e_data.translation = clean_node(wxr, sense, node)
                    calculate_bold_offsets(
                        wxr,
                        node,
                        e_data.translation,
                        e_data,
                        "bold_translation_offsets",
                    )
            elif node.kind == NodeKind.LIST:
                for child_list_item in node.find_child(NodeKind.LIST_ITEM):
                    extract_example_list_item(
                        wxr, word_entry, sense, child_list_item, e_data
                    )
            elif is_first_node and node.kind == NodeKind.BOLD:
                bold_text = clean_node(wxr, None, node)
                # A bold text of exactly four digits (presumably a year)
                # marks the item as a source reference line.
                if re.fullmatch(r"\d{4}", bold_text):
                    # The reference is built from the children selected by
                    # invert_find_child(NodeKind.LIST) -- presumably every
                    # child that is not a nested list; confirm against
                    # wikitextprocessor.
                    e_data.ref = clean_node(
                        wxr,
                        sense,
                        list(list_item.invert_find_child(NodeKind.LIST)),
                    )
                    is_ref = True
            # NOTE(review): the nesting of this reset was reconstructed from
            # a flattened listing; confirm it belongs to the WikiNode branch
            # rather than directly to the loop body.
            is_first_node = False

    # Only the top-level call stores the example, and only when some
    # example text was actually found.
    if e_data.text != "" and parent_e_data is None:
        sense.examples.append(e_data)
def extract_cp_template(
    wxr: WiktextractContext, sense: Sense, t_node: TemplateNode, e_data: Example
) -> None:
    """Fill ``e_data`` from the expanded output of an example template.

    The template is expanded and re-parsed, then every HTML node in the
    result is inspected. The node's ``class`` attribute decides which
    ``Example`` field receives its cleaned text. Finally the whole expanded
    tree is cleaned once more with ``sense`` as the target.

    Args:
        wxr: Extraction context used for expansion and cleaning.
        sense: Sense passed to the final ``clean_node`` call.
        t_node: Template node to expand.
        e_data: Example object updated in place.
    """
    # Ordered rules: (class markers, Example field, bold-offsets field).
    # The first rule with a marker contained in the class string wins,
    # so the order here is significant.
    class_rules = (
        (("e-example", "e-quotation"), "text", "bold_text_offsets"),
        (("e-transliteration",), "roman", "bold_roman_offsets"),
        (("e-translation",), "translation", "bold_translation_offsets"),
        (("e-literally",), "literal_meaning", "bold_literal_offsets"),
        (("cited-source",), "ref", None),
    )

    template_wikitext = wxr.wtp.node_to_wikitext(t_node)
    expanded_root = wxr.wtp.parse(template_wikitext, expand_all=True)

    for tag in expanded_root.find_child_recursively(NodeKind.HTML):
        class_value = tag.attrs.get("class", "")
        for markers, target_field, offsets_field in class_rules:
            if not any(marker in class_value for marker in markers):
                continue
            setattr(e_data, target_field, clean_node(wxr, None, tag))
            if offsets_field is not None:
                # Use the value as stored on the model, not the raw result.
                calculate_bold_offsets(
                    wxr,
                    tag,
                    getattr(e_data, target_field),
                    e_data,
                    offsets_field,
                )
            break

    clean_node(wxr, sense, expanded_root)