Coverage for src/wiktextract/extractor/ja/header.py: 98%

1import re

3from wikitextprocessor.parser import HTMLNode, NodeKind, TemplateNode, WikiNode

5from ...page import clean_node

6from ...wxr_context import WiktextractContext

7from .models import Form, WordEntry

8from .tags import translate_raw_tags

# CSS class names that are copied verbatim into a form's tags when they
# appear in the "class" attribute of the HTML node the form was read from.
FORM_OF_CLASS_TAGS = frozenset({"kanji", "plural"})

def extract_header_nodes(
    wxr: WiktextractContext, word_entry: WordEntry, nodes: list[WikiNode]
) -> None:
    """Read forms and raw tags from headword-line nodes into ``word_entry``.

    The nodes (minus the ``jachar``/``kochar``/``vichar``/``zhchar``
    templates) are converted back to wikitext, re-parsed with all templates
    expanded, and then every HTML, bold and italic node of the result is
    visited:

    * an HTML node whose class list contains ``gender`` contributes each
      whitespace/comma separated piece of its text to
      ``word_entry.raw_tags``;
    * ``<small>`` and ``<i>`` text is queued as a raw tag for the next form;
    * inside a ``form-of`` ``<span>``, plain strings are queued as raw tags
      and bold children are recorded as forms;
    * every other accepted node is recorded as a form through
      ``add_form_data``.

    Raw tags still queued after the last node are added to ``word_entry``
    itself.

    Args:
        wxr: Context providing the wikitext parser (``wxr.wtp``).
        word_entry: Entry that receives forms and raw tags; modified in place.
        nodes: Nodes of the headword line.
    """
    char_templates = ("jachar", "kochar", "vichar", "zhchar")

    def is_char_template(candidate) -> bool:
        return (
            isinstance(candidate, TemplateNode)
            and candidate.template_name in char_templates
        )

    kept_nodes = [item for item in nodes if not is_char_template(item)]
    # Dropping any character template means the first bold text is no longer
    # passed on as canonical.
    first_bold_pending = len(kept_nodes) == len(nodes)

    root = wxr.wtp.parse(
        wxr.wtp.node_to_wikitext(kept_nodes), expand_all=True
    )
    seen_forms = {}
    pending_tags = []
    wanted_kinds = NodeKind.HTML | NodeKind.BOLD | NodeKind.ITALIC
    for node in root.find_child_recursively(wanted_kinds):
        is_html = isinstance(node, HTMLNode)
        css_class = node.attrs.get("class", "") if is_html else ""

        if is_html and "gender" in css_class.split():
            gender_text = clean_node(wxr, None, node)
            pieces = (
                piece.strip() for piece in re.split(r"\s|,", gender_text)
            )
            word_entry.raw_tags.extend(
                piece for piece in pieces if piece != ""
            )

        if is_html:
            if not (
                node.tag in ("strong", "small", "i", "b")
                or "headword" in css_class
                or "form-of" in css_class
            ):
                # HTML node that carries neither a form nor a tag.
                continue

            if node.tag in ("small", "i"):
                label = clean_node(wxr, None, node).strip("(): ")
                # ignore "又は"(or) in "ja-noun" template
                if label != "又は" and label not in pending_tags:
                    pending_tags.append(label)
                continue

            if node.tag == "span" and "form-of" in css_class:
                for child in node.children:
                    if isinstance(child, str):
                        text = child.strip()
                        if text != "":
                            pending_tags.append(text)
                    elif (
                        isinstance(child, WikiNode)
                        and child.kind == NodeKind.BOLD
                    ):
                        word = clean_node(wxr, None, child)
                        if word != "":
                            add_form_data(
                                node,
                                word,
                                seen_forms,
                                word_entry,
                                pending_tags,
                            )
                        # The queued tags belong to this bold child only.
                        pending_tags.clear()
                continue

        # Bold/italic wiki nodes and the remaining accepted HTML nodes hold
        # the form text itself.
        form_text = clean_node(wxr, None, node).strip("（）【】 ")
        add_form_data(
            node,
            form_text,
            seen_forms,
            word_entry,
            pending_tags,
            is_canonical=first_bold_pending,
        )
        if node.kind == NodeKind.BOLD:
            first_bold_pending = False
        pending_tags.clear()

    # Second pass over the whole expanded tree, this time handing
    # ``word_entry`` to ``clean_node`` (presumably so it can record
    # page-level data such as categories -- confirm against ``clean_node``).
    clean_node(wxr, word_entry, root)
    if pending_tags:
        word_entry.raw_tags.extend(pending_tags)
        translate_raw_tags(word_entry)

def add_form_data(
    node: WikiNode,
    forms_text: str,
    extracted_forms: dict[str, Form],
    word_entry: WordEntry,
    raw_tags: list[str],
    is_canonical: bool = False,
) -> None:
    """Split ``forms_text`` into individual forms and add them to the entry.

    ``forms_text`` is split on ``・``, ``、``, ``,`` and ``•``.  For each
    piece:

    * if the form was already recorded in ``extracted_forms``, the missing
      ``raw_tags`` are merged into the existing form;
    * if it is empty or equals ``word_entry.word`` (also after removing
      spaces), it is skipped;
    * otherwise a new ``Form`` is created, tagged, remembered in
      ``extracted_forms`` and appended to ``word_entry.forms``.

    Args:
        node: Node the text was read from; its kind, tag and ``class``
            attribute decide which tags the form receives.
        forms_text: Cleaned text that may hold several separated forms.
        extracted_forms: Forms seen so far, keyed by form text; updated in
            place.
        word_entry: Entry that receives the new forms; modified in place.
        raw_tags: Raw tags collected for these forms.  The list is only
            read, never stored, so the caller may clear it afterwards.
        is_canonical: When true, the first newly created form coming from a
            bold or ``<strong>`` node is tagged ``"canonical"``.
    """
    # "又は" (or) alone marks an alternative of the previously added form.
    is_or_marker = raw_tags == ["又は"]
    is_html = isinstance(node, HTMLNode)
    is_bold_like = node.kind == NodeKind.BOLD or (
        is_html and node.tag == "strong"
    )
    class_names = node.attrs.get("class", "") if is_html else ""

    for form_text in re.split(r"・|、|,|•", forms_text):
        form_text = form_text.strip()
        if form_text in extracted_forms:
            form = extracted_forms[form_text]
            for raw_tag in raw_tags:
                if raw_tag not in form.raw_tags:
                    form.raw_tags.append(raw_tag)
            translate_raw_tags(form)
            continue
        if (
            not form_text
            or form_text == word_entry.word
            or form_text.replace(" ", "") == word_entry.word
        ):
            continue

        # Copy the tags: the caller clears ``raw_tags`` right after this
        # call, which must not empty the tags of the stored form.
        form = Form(
            form=form_text, raw_tags=[] if is_or_marker else list(raw_tags)
        )
        extracted_forms[form_text] = form
        if is_bold_like and is_canonical:
            form.tags.append("canonical")
            # Only the first new form of this call is canonical.
            is_canonical = False
        if is_html:
            # Sorted so the tag order does not depend on set iteration order.
            for class_name in sorted(FORM_OF_CLASS_TAGS):
                if class_name in class_names:
                    form.tags.append(class_name)
            if "tr Latn" in class_names:
                form.tags.append("transliteration")
        translate_raw_tags(form)
        if is_or_marker and len(word_entry.forms) > 0:
            # An "or" alternative takes over the tags of the previous form.
            # Copies keep later in-place changes to one form from leaking
            # into the other.
            previous_form = word_entry.forms[-1]
            form.tags = list(previous_form.tags)
            form.raw_tags = list(previous_form.raw_tags)
        word_entry.forms.append(form)