Coverage for src/wiktextract/extractor/ja/header.py: 98%
77 statements
« prev ^ index » next coverage.py v7.9.2, created at 2025-07-04 10:58 +0000
« prev ^ index » next coverage.py v7.9.2, created at 2025-07-04 10:58 +0000
1import re
3from wikitextprocessor.parser import HTMLNode, NodeKind, TemplateNode, WikiNode
5from ...page import clean_node
6from ...wxr_context import WiktextractContext
7from .models import Form, WordEntry
8from .tags import translate_raw_tags
# HTML class names that add_form_data() copies verbatim into a form's tags
# when they occur in the class attribute of the node the form came from.
FORM_OF_CLASS_TAGS = frozenset({"kanji", "plural"})
def extract_header_nodes(
    wxr: WiktextractContext, word_entry: WordEntry, nodes: list[WikiNode]
) -> None:
    """Collect forms and raw tags from the nodes of a headword line.

    Templates named ``jachar``, ``kochar``, ``vichar`` or ``zhchar`` are
    dropped; once one of them is seen, no form is flagged as canonical.
    The remaining nodes are converted back to wikitext, re-parsed with
    ``expand_all=True``, and the HTML, bold and italic nodes of the result
    are visited in order:

    * an HTML element whose class list contains ``gender`` has its text
      split on whitespace and commas, and every non-empty piece is appended
      to ``word_entry.raw_tags``;
    * an HTML element that is not ``strong``/``small``/``i``/``b`` and whose
      class mentions neither ``headword`` nor ``form-of`` is then skipped;
    * ``small`` and ``i`` elements contribute a pending raw tag;
    * a ``span`` with ``form-of`` in its class contributes pending raw tags
      (string children) and forms (bold children);
    * anything else is treated as form text and handed to add_form_data().

    Pending raw tags left over after the walk are added to
    ``word_entry.raw_tags`` before ``translate_raw_tags(word_entry)`` runs.

    Args:
        wxr: Extraction context; supplies the wikitext parser.
        word_entry: Entry that receives the forms and raw tags (mutated).
        nodes: Nodes of the headword line.
    """
    char_templates = ("jachar", "kochar", "vichar", "zhchar")
    seen_forms = {}
    kept_nodes = []
    canonical_pending = True
    for header_node in nodes:
        if (
            isinstance(header_node, TemplateNode)
            and header_node.template_name in char_templates
        ):
            canonical_pending = False
            continue
        kept_nodes.append(header_node)

    wikitext = wxr.wtp.node_to_wikitext(kept_nodes)
    root = wxr.wtp.parse(wikitext, expand_all=True)
    wanted_kinds = NodeKind.HTML | NodeKind.BOLD | NodeKind.ITALIC
    pending_tags = []
    for node in root.find_child_recursively(wanted_kinds):
        is_html = isinstance(node, HTMLNode)
        css_class = node.attrs.get("class", "") if is_html else ""

        if is_html and "gender" in css_class.split():
            gender_text = clean_node(wxr, None, node)
            for piece in re.split(r"\s|,", gender_text):
                piece = piece.strip()
                if piece:
                    word_entry.raw_tags.append(piece)

        # Only a few HTML elements carry forms or tags; skip the rest.
        if (
            is_html
            and node.tag not in ("strong", "small", "i", "b")
            and "headword" not in css_class
            and "form-of" not in css_class
        ):
            continue

        if is_html and node.tag in ("small", "i"):
            label = clean_node(wxr, None, node).strip("(): ")
            # ignore "又は"(or) in "ja-noun" template
            if label != "又は" and label not in pending_tags:
                pending_tags.append(label)
            continue

        if is_html and node.tag == "span" and "form-of" in css_class:
            for child in node.children:
                if isinstance(child, str):
                    child_text = child.strip()
                    if child_text:
                        pending_tags.append(child_text)
                elif (
                    isinstance(child, WikiNode)
                    and child.kind == NodeKind.BOLD
                ):
                    word = clean_node(wxr, None, child)
                    if word:
                        add_form_data(
                            node, word, seen_forms, word_entry, pending_tags
                        )
                    # Tags gathered so far belong to this bold child only.
                    pending_tags.clear()
            continue

        form_text = clean_node(wxr, None, node).strip("()【】 ")
        add_form_data(
            node,
            form_text,
            seen_forms,
            word_entry,
            pending_tags,
            is_canonical=canonical_pending,
        )
        if node.kind == NodeKind.BOLD:
            canonical_pending = False
        pending_tags.clear()

    # NOTE(review): called with word_entry on the whole expanded tree and the
    # returned text is discarded; presumably for its side effects on the
    # entry - confirm against clean_node().
    clean_node(wxr, word_entry, root)
    if pending_tags:
        word_entry.raw_tags.extend(pending_tags)
    translate_raw_tags(word_entry)
def add_form_data(
    node: WikiNode,
    forms_text: str,
    extracted_forms: dict[str, Form],
    word_entry: WordEntry,
    raw_tags: list[str],
    is_canonical: bool = False,
) -> None:
    """Split ``forms_text`` into forms and append them to ``word_entry``.

    The text is split on ``・``, ``、``, ``,`` and ``•``. For each stripped
    piece:

    * if it is already a key of ``extracted_forms``, the missing entries of
      ``raw_tags`` are merged into that existing form and nothing new is
      appended;
    * if it is empty, or equals ``word_entry.word`` (also after removing
      spaces), it is skipped;
    * otherwise a new ``Form`` is created, recorded in ``extracted_forms``
      and appended to ``word_entry.forms``.

    When ``raw_tags`` is exactly ``["又は"]`` the new form is created without
    raw tags and, if ``word_entry.forms`` is not empty, takes over the tags
    and raw tags of the last form in that list.

    Args:
        node: Node the text came from; its kind, tag and class attribute
            decide which tags are added.
        forms_text: Text that may contain several separated forms.
        extracted_forms: Forms created so far, keyed by form text (mutated).
        word_entry: Entry that receives the new forms (mutated).
        raw_tags: Raw tags to attach to the forms.
        is_canonical: If true, the first new form from a bold node or a
            ``strong`` element gets the ``canonical`` tag.
    """
    node_is_html = isinstance(node, HTMLNode)
    node_is_bold = node.kind == NodeKind.BOLD or (
        node_is_html and node.tag == "strong"
    )
    for piece in re.split(r"・|、|,|•", forms_text):
        piece = piece.strip()

        known_form = extracted_forms.get(piece)
        if known_form is not None:
            # Form seen before: only merge in the raw tags it lacks.
            for tag in raw_tags:
                if tag not in known_form.raw_tags:
                    known_form.raw_tags.append(tag)
            translate_raw_tags(known_form)
            continue

        if piece == "" or word_entry.word in (piece, piece.replace(" ", "")):
            continue

        new_form = Form(
            form=piece, raw_tags=[] if raw_tags == ["又は"] else raw_tags
        )
        extracted_forms[piece] = new_form
        if is_canonical and node_is_bold:
            new_form.tags.append("canonical")
            # Only the first new form of this call may be canonical.
            is_canonical = False
        if node_is_html:
            css_class = node.attrs.get("class", "")
            for class_tag in FORM_OF_CLASS_TAGS:
                if class_tag in css_class:
                    new_form.tags.append(class_tag)
            if "tr Latn" in css_class:
                new_form.tags.append("transliteration")
        translate_raw_tags(new_form)
        if raw_tags == ["又は"] and len(word_entry.forms) > 0:
            previous_form = word_entry.forms[-1]
            new_form.tags = previous_form.tags
            new_form.raw_tags = previous_form.raw_tags
        word_entry.forms.append(new_form)