Coverage for src/wiktextract/extractor/ja/header.py: 96% (87 statements)
coverage.py v7.13.1, created at 2026-01-02 00:27 +0000
import re

from wikitextprocessor.parser import HTMLNode, NodeKind, TemplateNode, WikiNode

from ...page import clean_node
from ...wxr_context import WiktextractContext
from .models import Classifier, Form, WordEntry
from .tags import translate_raw_tags

FORM_OF_CLASS_TAGS = frozenset(["kanji", "plural"])
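
# extract_header_nodes() parses the headword line of a Japanese Wiktionary
# entry: it keeps the language-specific headword templates, expands them, and
# walks the resulting HTML/bold/italic nodes to collect alternative forms,
# gender raw tags, and classifiers into `word_entry`.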
def extract_header_nodes(
    wxr: WiktextractContext, word_entry: WordEntry, nodes: list[WikiNode]
) -> None:
    extracted_forms = {}
    use_nodes = []
    is_first_bold = True
    for node in nodes:
        if isinstance(node, TemplateNode) and node.template_name in (
            "jachar",
            "kochar",
            "vichar",
            "zhchar",
        ):
            is_first_bold = False
        elif isinstance(node, TemplateNode):
            # coverage: partial branch, this condition was always true in tests
            if node.template_name.startswith(word_entry.lang_code + "-"):
                use_nodes.append(node)
            # ignore other templates, like "wikipedia" and "commonscat"
        else:
            use_nodes.append(node)
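    # Re-expand only the kept nodes: serializing them back to wikitext and
    # parsing with expand_all=True turns the headword templates into plain
    # HTML that the loop below can inspect.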
    expanded_nodes = wxr.wtp.parse(
        wxr.wtp.node_to_wikitext(use_nodes), expand_all=True
    )
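    # Walk every HTML, bold, and italic node of the expanded headword line:
    # "gender" spans feed word-level raw tags, <small>/<i>/italic nodes hold
    # qualifier text that applies to the following form, "form-of" spans carry
    # the forms themselves, and the remaining bold/headword nodes are treated
    # as forms directly (the first bold one becomes the canonical form).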
    raw_tags = []
    for node in expanded_nodes.find_child_recursively(
        NodeKind.HTML | NodeKind.BOLD | NodeKind.ITALIC
    ):
        if (
            isinstance(node, HTMLNode)
            and "gender" in node.attrs.get("class", "").split()
        ):
            raw_tag_text = clean_node(wxr, None, node)
            for raw_tag in re.split(r"\s|,", raw_tag_text):
                raw_tag = raw_tag.strip()
                # coverage: partial branch, this condition was always true
                if raw_tag != "":
                    word_entry.raw_tags.append(raw_tag)
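        # Skip HTML elements that are neither strong/small/i/b tags nor carry
        # a "headword" or "form-of" class.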
        if isinstance(node, HTMLNode) and not (
            node.tag in ["strong", "small", "i", "b"]
            or "headword" in node.attrs.get("class", "")
            or "form-of" in node.attrs.get("class", "")
        ):
            continue
        if (isinstance(node, HTMLNode) and node.tag in ["small", "i"]) or (
            isinstance(node, WikiNode) and node.kind == NodeKind.ITALIC
        ):
            raw_tag = clean_node(wxr, None, node).strip("(): ")
            if raw_tag not in raw_tags:
                raw_tags.append(raw_tag)
        elif (
            isinstance(node, HTMLNode)
            and node.tag == "span"
            and "form-of" in node.attrs.get("class", "").split()
        ):
            for span_child in node.children:
                if isinstance(span_child, str) and span_child.strip() != "":
                    raw_tags.append(span_child.strip())
                elif (
                    isinstance(span_child, WikiNode)
                    and span_child.kind == NodeKind.BOLD
                ):
                    word = clean_node(wxr, None, span_child)
                    # coverage: partial branch, this condition was always true
                    if word != "":
                        add_form_data(
                            node, word, extracted_forms, word_entry, raw_tags
                        )
                    raw_tags.clear()
        else:
            form_text = clean_node(wxr, None, node).strip("()【】 ")
            add_form_data(
                node,
                form_text,
                extracted_forms,
                word_entry,
                raw_tags,
                is_canonical=is_first_bold,
            )
            if node.kind == NodeKind.BOLD:
                is_first_bold = False
            raw_tags.clear()
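    # Forms whose raw tags mark them as measure words ("類別詞") are moved
    # from word_entry.forms to word_entry.classifiers.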
    new_forms = []
    for form in word_entry.forms:
        # coverage: partial branch, this condition was never true in tests
        if "類別詞" in form.raw_tags:
            word_entry.classifiers.append(
                Classifier(
                    classifier=form.form, tags=form.tags, raw_tags=form.raw_tags
                )
            )
        else:
            new_forms.append(form)
    word_entry.forms = new_forms
    clean_node(wxr, word_entry, expanded_nodes)
    if len(raw_tags) > 0:
        word_entry.raw_tags.extend(raw_tags)
        translate_raw_tags(word_entry)
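
# add_form_data() splits a form string on common separators (・ 、 , •),
# deduplicates against `extracted_forms`, skips text identical to the page
# title, and appends the remaining forms to word_entry.forms together with
# the pending raw tags.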
def add_form_data(
    node: WikiNode,
    forms_text: str,
    extracted_forms: dict[str, Form],
    word_entry: WordEntry,
    raw_tags: list[str],
    is_canonical: bool = False,
) -> None:
    for form_text in re.split(r"・|、|,|•", forms_text):
        form_text = form_text.strip()
        if form_text in extracted_forms:
            form = extracted_forms[form_text]
            for raw_tag in raw_tags:
                # coverage: partial branch, this condition was always true
                if raw_tag not in form.raw_tags:
                    form.raw_tags.append(raw_tag)
            translate_raw_tags(form)
            continue
        elif (
            form_text == word_entry.word
            or form_text.replace(" ", "") == word_entry.word
            or len(form_text) == 0
        ):
            continue
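        # New form: the connective 「又は」 ("or") that sometimes separates
        # forms is not a meaningful tag, so it is not stored as a raw tag;
        # instead the previous form's tags are copied further down.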
        form = Form(
            form=form_text, raw_tags=raw_tags if raw_tags != ["又は"] else []
        )
        extracted_forms[form_text] = form
        if (
            node.kind == NodeKind.BOLD
            or (isinstance(node, HTMLNode) and node.tag == "strong")
        ) and is_canonical:
            form.tags.append("canonical")
            is_canonical = False
        if isinstance(node, HTMLNode):
            class_names = node.attrs.get("class", "")
            for class_name in FORM_OF_CLASS_TAGS:
                if class_name in class_names:
                    form.tags.append(class_name)
            class_name = node.attrs.get("class", "")
            if "tr Latn" in class_name or "headword-tr" in class_name:
                form.tags.append("transliteration")
        translate_raw_tags(form)
        if raw_tags == ["又は"] and len(word_entry.forms) > 0:
            form.tags = word_entry.forms[-1].tags
            form.raw_tags = word_entry.forms[-1].raw_tags
        word_entry.forms.append(form)
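
# Illustrative usage sketch (not part of this module; `pos_section_node` and
# the way `wxr` / `word_entry` are constructed are assumptions made for the
# example): a page-level extractor would typically gather the headword-line
# nodes of a part-of-speech section and pass them here, e.g.:
#
#     header_nodes = [
#         node
#         for node in pos_section_node.children
#         if not (isinstance(node, WikiNode) and node.kind == NodeKind.LIST)
#     ]
#     extract_header_nodes(wxr, word_entry, header_nodes)
#     # word_entry.forms, word_entry.raw_tags, and word_entry.classifiers
#     # are now populated from the headword line.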