Coverage for src/wiktextract/extractor/ja/header.py: 96%

87 statements  

coverage.py v7.13.1, created at 2026-01-02 00:27 +0000

  1  import re
  2
  3  from wikitextprocessor.parser import HTMLNode, NodeKind, TemplateNode, WikiNode
  4
  5  from ...page import clean_node
  6  from ...wxr_context import WiktextractContext
  7  from .models import Classifier, Form, WordEntry
  8  from .tags import translate_raw_tags
  9
 10  FORM_OF_CLASS_TAGS = frozenset(["kanji", "plural"])
 11
 12
 13  def extract_header_nodes(
 14      wxr: WiktextractContext, word_entry: WordEntry, nodes: list[WikiNode]
 15  ) -> None:
 16      extracted_forms = {}
 17      use_nodes = []
 18      is_first_bold = True
 19      for node in nodes:
 20          if isinstance(node, TemplateNode) and node.template_name in (
 21              "jachar",
 22              "kochar",
 23              "vichar",
 24              "zhchar",
 25          ):
 26              is_first_bold = False
 27          elif isinstance(node, TemplateNode):
 28              if node.template_name.startswith(word_entry.lang_code + "-"):    28 ↛ 19: line 28 didn't jump to line 19 because the condition on line 28 was always true
 29                  use_nodes.append(node)
 30              # ignore other templates, like "wikipedia" and "commonscat"
 31          else:
 32              use_nodes.append(node)
 33      expanded_nodes = wxr.wtp.parse(
 34          wxr.wtp.node_to_wikitext(use_nodes), expand_all=True
 35      )
 36      raw_tags = []
 37      for node in expanded_nodes.find_child_recursively(
 38          NodeKind.HTML | NodeKind.BOLD | NodeKind.ITALIC
 39      ):
 40          if (
 41              isinstance(node, HTMLNode)
 42              and "gender" in node.attrs.get("class", "").split()
 43          ):
 44              raw_tag_text = clean_node(wxr, None, node)
 45              for raw_tag in re.split(r"\s|,", raw_tag_text):
 46                  raw_tag = raw_tag.strip()
 47                  if raw_tag != "":    47 ↛ 45: line 47 didn't jump to line 45 because the condition on line 47 was always true
 48                      word_entry.raw_tags.append(raw_tag)
 49          if isinstance(node, HTMLNode) and not (
 50              node.tag in ["strong", "small", "i", "b"]
 51              or "headword" in node.attrs.get("class", "")
 52              or "form-of" in node.attrs.get("class", "")
 53          ):
 54              continue
 55          if (isinstance(node, HTMLNode) and node.tag in ["small", "i"]) or (
 56              isinstance(node, WikiNode) and node.kind == NodeKind.ITALIC
 57          ):
 58              raw_tag = clean_node(wxr, None, node).strip("(): ")
 59              if raw_tag not in raw_tags:
 60                  raw_tags.append(raw_tag)
 61          elif (
 62              isinstance(node, HTMLNode)
 63              and node.tag == "span"
 64              and "form-of" in node.attrs.get("class", "").split()
 65          ):
 66              for span_child in node.children:
 67                  if isinstance(span_child, str) and span_child.strip() != "":
 68                      raw_tags.append(span_child.strip())
 69                  elif (
 70                      isinstance(span_child, WikiNode)
 71                      and span_child.kind == NodeKind.BOLD
 72                  ):
 73                      word = clean_node(wxr, None, span_child)
 74                      if word != "":    74 ↛ 78: line 74 didn't jump to line 78 because the condition on line 74 was always true
 75                          add_form_data(
 76                              node, word, extracted_forms, word_entry, raw_tags
 77                          )
 78                      raw_tags.clear()
 79          else:
 80              form_text = clean_node(wxr, None, node).strip("()【】 ")
 81              add_form_data(
 82                  node,
 83                  form_text,
 84                  extracted_forms,
 85                  word_entry,
 86                  raw_tags,
 87                  is_canonical=is_first_bold,
 88              )
 89              if node.kind == NodeKind.BOLD:
 90                  is_first_bold = False
 91              raw_tags.clear()
 92      new_forms = []
 93      for form in word_entry.forms:
 94          if "類別詞" in form.raw_tags:    94 ↛ 95: line 94 didn't jump to line 95 because the condition on line 94 was never true
 95              word_entry.classifiers.append(
 96                  Classifier(
 97                      classifier=form.form, tags=form.tags, raw_tags=form.raw_tags
 98                  )
 99              )
100          else:
101              new_forms.append(form)
102      word_entry.forms = new_forms
103      clean_node(wxr, word_entry, expanded_nodes)
104      if len(raw_tags) > 0:
105          word_entry.raw_tags.extend(raw_tags)
106      translate_raw_tags(word_entry)
107
108
109  def add_form_data(
110      node: WikiNode,
111      forms_text: str,
112      extracted_forms: dict[str, Form],
113      word_entry: WordEntry,
114      raw_tags: list[str],
115      is_canonical: bool = False,
116  ) -> None:
117      for form_text in re.split(r"・|、|,|•", forms_text):
118          form_text = form_text.strip()
119          if form_text in extracted_forms:
120              form = extracted_forms[form_text]
121              for raw_tag in raw_tags:
122                  if raw_tag not in form.raw_tags:    122 ↛ 121: line 122 didn't jump to line 121 because the condition on line 122 was always true
123                      form.raw_tags.append(raw_tag)
124              translate_raw_tags(form)
125              continue
126          elif (
127              form_text == word_entry.word
128              or form_text.replace(" ", "") == word_entry.word
129              or len(form_text) == 0
130          ):
131              continue
132          form = Form(
133              form=form_text, raw_tags=raw_tags if raw_tags != ["又は"] else []
134          )
135          extracted_forms[form_text] = form
136          if (
137              node.kind == NodeKind.BOLD
138              or (isinstance(node, HTMLNode) and node.tag == "strong")
139          ) and is_canonical:
140              form.tags.append("canonical")
141              is_canonical = False
142          if isinstance(node, HTMLNode):
143              class_names = node.attrs.get("class", "")
144              for class_name in FORM_OF_CLASS_TAGS:
145                  if class_name in class_names:
146                      form.tags.append(class_name)
147              class_name = node.attrs.get("class", "")
148              if "tr Latn" in class_name or "headword-tr" in class_name:
149                  form.tags.append("transliteration")
150          translate_raw_tags(form)
151          if raw_tags == ["又は"] and len(word_entry.forms) > 0:
152              form.tags = word_entry.forms[-1].tags
153              form.raw_tags = word_entry.forms[-1].raw_tags
154          word_entry.forms.append(form)
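
For quick reference, a minimal standalone sketch of the two re.split calls in the listing above (lines 45 and 117); the example strings are hypothetical and only the standard library is used:

    import re

    # Form separators handled by add_form_data (line 117): nakaguro,
    # ideographic comma, ASCII comma, and bullet.
    forms_text = "ねこ・ネコ, neko"  # hypothetical headword form text
    print([f.strip() for f in re.split(r"・|、|,|•", forms_text) if f.strip()])
    # ['ねこ', 'ネコ', 'neko']

    # Text from a "gender" span is split on whitespace or commas (line 45)
    # before each non-empty piece is appended to word_entry.raw_tags.
    raw_tag_text = "男性, 複数"  # hypothetical span content
    print([t for t in re.split(r"\s|,", raw_tag_text) if t.strip()])
    # ['男性', '複数']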