Coverage for src/wiktextract/extractor/ja/header.py: 100%
49 statements
« prev ^ index » next coverage.py v7.6.4, created at 2024-10-25 10:11 +0000
« prev ^ index » next coverage.py v7.6.4, created at 2024-10-25 10:11 +0000
1import re
3from wikitextprocessor.parser import HTMLNode, NodeKind, TemplateNode, WikiNode
5from ...page import clean_node
6from ...wxr_context import WiktextractContext
7from .models import Form, WordEntry
8from .tags import translate_raw_tags
# Names looked for (by substring match) inside the "class" attribute of a
# headword HTML element; each one found is added to the form's tags as-is.
FORM_OF_CLASS_TAGS = frozenset(("kanji", "plural"))
def extract_header_nodes(
    wxr: WiktextractContext, word_entry: WordEntry, nodes: list[WikiNode]
) -> None:
    """Collect the forms shown on a headword line into ``word_entry.forms``.

    The nodes are converted back to wikitext (minus the ``*char`` templates),
    re-parsed with all templates expanded, and then scanned twice:

    1. every HTML / bold / italic node is visited; ``<small>`` elements
       supply raw tags for the next form, other accepted nodes supply forms;
    2. the cleaned text of the whole line is searched for bracketed spans
       (``(...)`` / ``【...】``), which are added as untagged forms.

    Duplicates across both passes are suppressed through a shared set of
    already-extracted form strings.

    Args:
        wxr: Extraction context providing the wikitext parser.
        word_entry: Entry being filled; forms are appended to it in place.
        nodes: Parsed nodes of the headword line.
    """
    char_templates = ("jachar", "kochar", "vichar", "zhchar")
    extracted_forms = set()
    use_nodes = []
    is_first_bold = True
    for node in nodes:
        if (
            isinstance(node, TemplateNode)
            and node.template_name in char_templates
        ):
            # These templates are left out of the expanded text, and when one
            # is present no bold form on the line is marked canonical.
            is_first_bold = False
        else:
            use_nodes.append(node)

    expanded_nodes = wxr.wtp.parse(
        wxr.wtp.node_to_wikitext(use_nodes), expand_all=True
    )

    # Raw tags gathered from <small> elements; they apply to the next form
    # node encountered and are reset once that form has been processed.
    raw_tags = []
    for node in expanded_nodes.find_child_recursively(
        NodeKind.HTML | NodeKind.BOLD | NodeKind.ITALIC
    ):
        is_html = isinstance(node, HTMLNode)
        if is_html:
            css_classes = node.attrs.get("class", "")
            is_accepted = (
                node.tag in ["strong", "small"]
                or "headword" in css_classes
                or "form-of" in css_classes
            )
            if not is_accepted:
                continue

        if is_html and node.tag == "small":
            raw_tag = clean_node(wxr, None, node).strip("(): ")
            if raw_tag != "又は" and raw_tag not in raw_tags:
                # ignore "又は"(or) in "ja-noun" template
                raw_tags.append(raw_tag)
            continue

        form_text = clean_node(wxr, None, node).strip("()【】 ")
        add_form_data(
            node,
            form_text,
            extracted_forms,
            word_entry,
            raw_tags,
            is_canonical=is_first_bold,
        )
        # add_form_data accepts both wiki bold and HTML <strong> as candidates
        # for the "canonical" tag, so either kind must consume the flag;
        # otherwise every later <strong> would be tagged canonical as well.
        if node.kind == NodeKind.BOLD or (is_html and node.tag == "strong"):
            is_first_bold = False
        raw_tags.clear()

    # Second pass over the plain text of the whole line. word_entry is passed
    # to clean_node only here (the per-node calls above pass None) --
    # NOTE(review): presumably so page-level side data is recorded once;
    # confirm against clean_node.
    texts = clean_node(wxr, word_entry, expanded_nodes)
    for form_text in re.findall(r"[(【][^()【】]+[)】]", texts):
        add_form_data(
            expanded_nodes,
            form_text.strip("()【】 "),
            extracted_forms,
            word_entry,
            [],
        )
def add_form_data(
    node: WikiNode,
    forms_text: str,
    extracted_forms: set[str],
    word_entry: WordEntry,
    raw_tags: list[str],
    is_canonical: bool = False,
) -> None:
    """Split ``forms_text`` into forms and append the new ones to the entry.

    ``forms_text`` is split on "・", "、" and ",". A piece is skipped when it
    is empty, equals ``word_entry.word`` (with or without its spaces), or was
    already recorded in ``extracted_forms``.

    Args:
        node: Node the text came from; its kind, tag and "class" attribute
            decide which tags the forms receive.
        forms_text: Text containing one or more separator-delimited forms.
        extracted_forms: Forms already added; updated in place.
        word_entry: Entry whose ``forms`` list is extended in place.
        raw_tags: Raw tags attached to every form created by this call.
        is_canonical: When true and ``node`` is wiki bold or an HTML
            ``<strong>``, the first form added gets the "canonical" tag.
    """
    is_html = isinstance(node, HTMLNode)
    is_bold_like = node.kind == NodeKind.BOLD or (
        is_html and node.tag == "strong"
    )
    class_names = node.attrs.get("class", "") if is_html else ""
    # Substring match against the class attribute is kept deliberately.
    # Sorting gives a stable tag order: iterating the frozenset directly
    # depends on string hashing, which may differ between interpreter runs.
    class_tags = [
        class_name
        for class_name in sorted(FORM_OF_CLASS_TAGS)
        if class_name in class_names
    ]

    for form_text in re.split(r"・|、|,", forms_text):
        form_text = form_text.strip()
        if not form_text or form_text in extracted_forms:
            continue
        if word_entry.word in (form_text, form_text.replace(" ", "")):
            # Same as the entry's own word: not a separate form.
            continue
        extracted_forms.add(form_text)
        # Copy the list so the form never shares state with the caller's
        # list, which the caller may reuse or clear after this call returns.
        form = Form(form=form_text, raw_tags=list(raw_tags))
        if is_bold_like and is_canonical:
            form.tags.append("canonical")
            # Only the first form produced from this text is canonical.
            is_canonical = False
        form.tags.extend(class_tags)
        translate_raw_tags(form)
        word_entry.forms.append(form)