Coverage for src / wiktextract / extractor / ja / header.py: 97%
81 statements
« prev ^ index » next coverage.py v7.12.0, created at 2025-12-05 07:46 +0000
« prev ^ index » next coverage.py v7.12.0, created at 2025-12-05 07:46 +0000
1import re
3from wikitextprocessor.parser import HTMLNode, NodeKind, TemplateNode, WikiNode
5from ...page import clean_node
6from ...wxr_context import WiktextractContext
7from .models import Form, WordEntry
8from .tags import translate_raw_tags
# HTML class names that are copied verbatim into a form's tags when they
# occur in the "class" attribute of the node the form was extracted from.
FORM_OF_CLASS_TAGS = frozenset({"kanji", "plural"})
def extract_header_nodes(
    wxr: WiktextractContext, word_entry: WordEntry, nodes: list[WikiNode]
) -> None:
    """Extract forms and raw tags from headword-line nodes into word_entry.

    The nodes are filtered, converted back to wikitext, re-parsed with all
    templates expanded, and the resulting HTML / bold / italic nodes are
    walked to collect forms (through ``add_form_data``) and raw tags.
    ``word_entry`` is modified in place; nothing is returned.
    """
    char_templates = ("jachar", "kochar", "vichar", "zhchar")
    seen_forms = {}
    kept_nodes = []
    canonical_pending = True
    for node in nodes:
        if not isinstance(node, TemplateNode):
            kept_nodes.append(node)
        elif node.template_name in char_templates:
            # These templates are dropped, and after one of them no form
            # is passed on as canonical anymore.
            canonical_pending = False
        elif node.template_name.startswith(word_entry.lang_code + "-"):
            kept_nodes.append(node)
        # ignore other templates, like "wikipedia" and "commonscat"

    root = wxr.wtp.parse(wxr.wtp.node_to_wikitext(kept_nodes), expand_all=True)
    wanted_kinds = NodeKind.HTML | NodeKind.BOLD | NodeKind.ITALIC
    pending_tags = []
    for node in root.find_child_recursively(wanted_kinds):
        is_html = isinstance(node, HTMLNode)
        class_attr = node.attrs.get("class", "") if is_html else ""

        if is_html and "gender" in class_attr.split():
            # Gender markers go straight onto the word entry.
            gender_text = clean_node(wxr, None, node)
            for piece in re.split(r"\s|,", gender_text):
                piece = piece.strip()
                if piece != "":
                    word_entry.raw_tags.append(piece)

        # Skip HTML elements that are neither a known tag nor marked as a
        # headword / form-of element.
        if (
            is_html
            and node.tag not in ["strong", "small", "i", "b"]
            and "headword" not in class_attr
            and "form-of" not in class_attr
        ):
            continue

        is_label = (is_html and node.tag in ["small", "i"]) or (
            isinstance(node, WikiNode) and node.kind == NodeKind.ITALIC
        )
        if is_label:
            label = clean_node(wxr, None, node).strip("(): ")
            # ignore "又は"(or) in "ja-noun" template
            if not (label == "又は" or label in pending_tags):
                pending_tags.append(label)
            continue

        if (
            is_html
            and node.tag == "span"
            and "form-of" in class_attr.split()
        ):
            for child in node.children:
                if isinstance(child, str):
                    text = child.strip()
                    if text != "":
                        pending_tags.append(text)
                elif (
                    isinstance(child, WikiNode)
                    and child.kind == NodeKind.BOLD
                ):
                    word = clean_node(wxr, None, child)
                    if word != "":
                        add_form_data(
                            node, word, seen_forms, word_entry, pending_tags
                        )
                    pending_tags.clear()
            continue

        # Everything else is treated as a form node.
        form_text = clean_node(wxr, None, node).strip("()【】 ")
        add_form_data(
            node,
            form_text,
            seen_forms,
            word_entry,
            pending_tags,
            is_canonical=canonical_pending,
        )
        if node.kind == NodeKind.BOLD:
            canonical_pending = False
        pending_tags.clear()

    clean_node(wxr, word_entry, root)
    if pending_tags:
        # Tags that were never attached to a form belong to the entry.
        word_entry.raw_tags.extend(pending_tags)
    translate_raw_tags(word_entry)
def add_form_data(
    node: WikiNode,
    forms_text: str,
    extracted_forms: dict[str, Form],
    word_entry: WordEntry,
    raw_tags: list[str],
    is_canonical: bool = False,
) -> None:
    """Split forms_text into individual forms and add them to word_entry.

    Args:
        node: Node the text was extracted from; its kind, tag and "class"
            attribute decide which tags a new form receives.
        forms_text: Text holding one or more forms separated by
            "・", "、", "," or "•".
        extracted_forms: Forms created so far, keyed by form text. Updated
            in place; a form that is already present only gets the raw
            tags it does not have yet.
        word_entry: Entry whose ``forms`` list receives the new forms.
            Pieces that are empty or equal to ``word_entry.word`` (also
            after removing spaces) are skipped.
        raw_tags: Raw tags for the forms. The list is only read; new forms
            get their own copy.
        is_canonical: If true, the first new form created from a bold or
            ``<strong>`` node is tagged "canonical".
    """
    # A lone "又は" (or) marker: the new form is an alternative of the
    # previously added form and takes over that form's tags.
    follows_or = raw_tags == ["又は"]
    for form_text in re.split(r"・|、|,|•", forms_text):
        form_text = form_text.strip()
        if form_text in extracted_forms:
            form = extracted_forms[form_text]
            for raw_tag in raw_tags:
                if raw_tag not in form.raw_tags:
                    form.raw_tags.append(raw_tag)
            translate_raw_tags(form)
            continue
        if (
            not form_text
            or form_text == word_entry.word
            or form_text.replace(" ", "") == word_entry.word
        ):
            continue

        form = Form(
            form=form_text, raw_tags=[] if follows_or else list(raw_tags)
        )
        extracted_forms[form_text] = form
        if is_canonical and (
            node.kind == NodeKind.BOLD
            or (isinstance(node, HTMLNode) and node.tag == "strong")
        ):
            form.tags.append("canonical")
            # Only one form per call may be canonical.
            is_canonical = False
        if isinstance(node, HTMLNode):
            class_names = node.attrs.get("class", "")
            # sorted() keeps the tag order independent of set iteration
            # order when more than one class matches.
            for class_name in sorted(FORM_OF_CLASS_TAGS):
                if class_name in class_names:
                    form.tags.append(class_name)
            if "tr Latn" in class_names or "headword-tr" in class_names:
                form.tags.append("transliteration")
        translate_raw_tags(form)
        if follows_or and word_entry.forms:
            # Copy instead of sharing the list objects, so that a later
            # in-place change to one form cannot leak into the other.
            previous = word_entry.forms[-1]
            form.tags = list(previous.tags)
            form.raw_tags = list(previous.raw_tags)
        word_entry.forms.append(form)