Coverage for src/wiktextract/extractor/ja/header.py: 97%
80 statements
« prev ^ index » next coverage.py v7.10.6, created at 2025-09-12 08:27 +0000
« prev ^ index » next coverage.py v7.10.6, created at 2025-09-12 08:27 +0000
1import re
3from wikitextprocessor.parser import HTMLNode, NodeKind, TemplateNode, WikiNode
5from ...page import clean_node
6from ...wxr_context import WiktextractContext
7from .models import Form, WordEntry
8from .tags import translate_raw_tags
# CSS class fragments that are copied verbatim onto a form as tags when they
# occur in the class attribute of the HTML node the form was read from.
FORM_OF_CLASS_TAGS = frozenset({"kanji", "plural"})
def _select_header_nodes(
    word_entry: WordEntry, nodes: list[WikiNode]
) -> tuple[list[WikiNode], bool]:
    """Pick the header nodes worth expanding.

    Returns the kept nodes and a flag telling whether the first bold node may
    still be treated as the canonical form.
    """
    kept = []
    first_bold_is_canonical = True
    for item in nodes:
        if not isinstance(item, TemplateNode):
            kept.append(item)
        elif item.template_name in ("jachar", "kochar", "vichar", "zhchar"):
            # a character-box template disables the "canonical" marking
            first_bold_is_canonical = False
        elif item.template_name.startswith(word_entry.lang_code + "-"):
            kept.append(item)
        # ignore other templates, like "wikipedia" and "commonscat"
    return kept, first_bold_is_canonical


def extract_header_nodes(
    wxr: WiktextractContext, word_entry: WordEntry, nodes: list[WikiNode]
) -> None:
    """Extract forms and raw tags from the headword line of an entry.

    The non-template nodes and the templates whose name starts with
    ``"<lang_code>-"`` are expanded and re-parsed.  The HTML, bold and italic
    nodes of the result are then walked: forms are recorded on ``word_entry``
    through ``add_form_data``, and qualifier text is collected as raw tags that
    are attached to the next form found, or to ``word_entry`` itself when no
    form follows.
    """
    extracted_forms = {}
    header_nodes, is_first_bold = _select_header_nodes(word_entry, nodes)
    expanded = wxr.wtp.parse(
        wxr.wtp.node_to_wikitext(header_nodes), expand_all=True
    )

    pending_tags = []
    wanted_kinds = NodeKind.HTML | NodeKind.BOLD | NodeKind.ITALIC
    for found in expanded.find_child_recursively(wanted_kinds):
        is_html = isinstance(found, HTMLNode)
        css_class = found.attrs.get("class", "") if is_html else ""

        if is_html and "gender" in css_class.split():
            # gender markers go straight onto the entry, not onto a form
            gender_text = clean_node(wxr, None, found)
            for piece in re.split(r"\s|,", gender_text):
                piece = piece.strip()
                if piece != "":
                    word_entry.raw_tags.append(piece)

        if is_html:
            is_relevant = (
                found.tag in ["strong", "small", "i", "b"]
                or "headword" in css_class
                or "form-of" in css_class
            )
            if not is_relevant:
                continue

            if found.tag in ["small", "i"]:
                label = clean_node(wxr, None, found).strip("(): ")
                # ignore "又は"(or) in "ja-noun" template
                if label != "又は" and label not in pending_tags:
                    pending_tags.append(label)
                continue

            if found.tag == "span" and "form-of" in css_class:
                for child in found.children:
                    if isinstance(child, str):
                        child_text = child.strip()
                        if child_text != "":
                            pending_tags.append(child_text)
                    elif (
                        isinstance(child, WikiNode)
                        and child.kind == NodeKind.BOLD
                    ):
                        word = clean_node(wxr, None, child)
                        if word != "":
                            add_form_data(
                                found,
                                word,
                                extracted_forms,
                                word_entry,
                                pending_tags,
                            )
                        pending_tags.clear()
                continue

        # everything else (bold/italic wiki nodes, <strong>, <b>, headword
        # spans) is read as form text
        form_text = clean_node(wxr, None, found).strip("()【】 ")
        add_form_data(
            found,
            form_text,
            extracted_forms,
            word_entry,
            pending_tags,
            is_canonical=is_first_bold,
        )
        if found.kind == NodeKind.BOLD:
            is_first_bold = False
        pending_tags.clear()

    clean_node(wxr, word_entry, expanded)
    if pending_tags:
        # qualifiers that were not followed by a form belong to the entry
        word_entry.raw_tags.extend(pending_tags)
    # runs unconditionally: gender markers may have been appended in the loop
    translate_raw_tags(word_entry)
def add_form_data(
    node: WikiNode,
    forms_text: str,
    extracted_forms: dict[str, Form],
    word_entry: WordEntry,
    raw_tags: list[str],
    is_canonical: bool = False,
) -> None:
    """Split ``forms_text`` into forms and record them on ``word_entry``.

    Args:
        node: Node the text was read from; its kind, tag and class attribute
            decide which tags a new form receives.
        forms_text: One or more forms separated by "・", "、", "," or "•".
        extracted_forms: Forms already created for this header, keyed by form
            text.  Updated in place; a repeated form only gains the raw tags
            it does not have yet instead of being added twice.
        word_entry: Entry that receives the new forms.  Forms equal to the
            entry word (also after removing spaces) and empty forms are
            skipped.
        raw_tags: Raw tags to attach to every form.  The list is copied, so
            the caller may clear or reuse it afterwards.
        is_canonical: When true, the first new form taken from a bold or
            ``<strong>`` node is tagged "canonical".
    """
    # A lone "又は" ("or") marker: the new form inherits the tags of the
    # previously added form instead of receiving tags of its own.
    is_alternative = raw_tags == ["又は"]

    for form_text in re.split(r"・|、|,|•", forms_text):
        form_text = form_text.strip()

        if form_text in extracted_forms:
            form = extracted_forms[form_text]
            for raw_tag in raw_tags:
                if raw_tag not in form.raw_tags:
                    form.raw_tags.append(raw_tag)
            translate_raw_tags(form)
            continue

        if (
            not form_text
            or form_text == word_entry.word
            or form_text.replace(" ", "") == word_entry.word
        ):
            continue

        # Copy the caller's list: it is cleared right after this call, and the
        # form must not share storage with it.
        form = Form(
            form=form_text, raw_tags=[] if is_alternative else list(raw_tags)
        )
        extracted_forms[form_text] = form

        if (
            node.kind == NodeKind.BOLD
            or (isinstance(node, HTMLNode) and node.tag == "strong")
        ) and is_canonical:
            form.tags.append("canonical")
            # only the first form of the split text is canonical
            is_canonical = False

        if isinstance(node, HTMLNode):
            class_names = node.attrs.get("class", "")
            # Sorted so the tag order does not depend on set iteration order,
            # which varies between interpreter runs.  Class names are matched
            # as substrings of the class attribute.
            for class_name in sorted(FORM_OF_CLASS_TAGS):
                if class_name in class_names:
                    form.tags.append(class_name)
            if "tr Latn" in class_names:
                form.tags.append("transliteration")

        translate_raw_tags(form)

        if is_alternative and word_entry.forms:
            # Copy instead of sharing the previous form's lists, so a later
            # in-place change to one form cannot leak into the other.
            previous_form = word_entry.forms[-1]
            form.tags = list(previous_form.tags)
            form.raw_tags = list(previous_form.raw_tags)

        word_entry.forms.append(form)