Coverage for src/wiktextract/extractor/ja/header.py: 97%

81 statements  

coverage.py v7.12.0, created at 2025-12-05 07:46 +0000

  1  import re
  2
  3  from wikitextprocessor.parser import HTMLNode, NodeKind, TemplateNode, WikiNode
  4
  5  from ...page import clean_node
  6  from ...wxr_context import WiktextractContext
  7  from .models import Form, WordEntry
  8  from .tags import translate_raw_tags
  9
 10  FORM_OF_CLASS_TAGS = frozenset(["kanji", "plural"])
 11
 12
 13  def extract_header_nodes(
 14      wxr: WiktextractContext, word_entry: WordEntry, nodes: list[WikiNode]
 15  ) -> None:
 16      extracted_forms = {}
 17      use_nodes = []
 18      is_first_bold = True
 19      for node in nodes:
 20          if isinstance(node, TemplateNode) and node.template_name in (
 21              "jachar",
 22              "kochar",
 23              "vichar",
 24              "zhchar",
 25          ):
 26              is_first_bold = False
 27          elif isinstance(node, TemplateNode):
 28              if node.template_name.startswith(word_entry.lang_code + "-"):    28 ↛ 19: line 28 didn't jump to line 19 because the condition on line 28 was always true
 29                  use_nodes.append(node)
 30              # ignore other templates, like "wikipedia" and "commonscat"
 31          else:
 32              use_nodes.append(node)
 33      expanded_nodes = wxr.wtp.parse(
 34          wxr.wtp.node_to_wikitext(use_nodes), expand_all=True
 35      )
 36      raw_tags = []
 37      for node in expanded_nodes.find_child_recursively(
 38          NodeKind.HTML | NodeKind.BOLD | NodeKind.ITALIC
 39      ):
 40          if (
 41              isinstance(node, HTMLNode)
 42              and "gender" in node.attrs.get("class", "").split()
 43          ):
 44              raw_tag_text = clean_node(wxr, None, node)
 45              for raw_tag in re.split(r"\s|,", raw_tag_text):
 46                  raw_tag = raw_tag.strip()
 47                  if raw_tag != "":    47 ↛ 45: line 47 didn't jump to line 45 because the condition on line 47 was always true
 48                      word_entry.raw_tags.append(raw_tag)
 49          if isinstance(node, HTMLNode) and not (
 50              node.tag in ["strong", "small", "i", "b"]
 51              or "headword" in node.attrs.get("class", "")
 52              or "form-of" in node.attrs.get("class", "")
 53          ):
 54              continue
 55          if (isinstance(node, HTMLNode) and node.tag in ["small", "i"]) or (
 56              isinstance(node, WikiNode) and node.kind == NodeKind.ITALIC
 57          ):
 58              raw_tag = clean_node(wxr, None, node).strip("(): ")
 59              if raw_tag != "又は" and raw_tag not in raw_tags:
 60                  # ignore "又は"(or) in "ja-noun" template
 61                  raw_tags.append(raw_tag)
 62          elif (
 63              isinstance(node, HTMLNode)
 64              and node.tag == "span"
 65              and "form-of" in node.attrs.get("class", "").split()
 66          ):
 67              for span_child in node.children:
 68                  if isinstance(span_child, str) and span_child.strip() != "":
 69                      raw_tags.append(span_child.strip())
 70                  elif (
 71                      isinstance(span_child, WikiNode)
 72                      and span_child.kind == NodeKind.BOLD
 73                  ):
 74                      word = clean_node(wxr, None, span_child)
 75                      if word != "":    75 ↛ 79: line 75 didn't jump to line 79 because the condition on line 75 was always true
 76                          add_form_data(
 77                              node, word, extracted_forms, word_entry, raw_tags
 78                          )
 79                      raw_tags.clear()
 80          else:
 81              form_text = clean_node(wxr, None, node).strip("()【】 ")
 82              add_form_data(
 83                  node,
 84                  form_text,
 85                  extracted_forms,
 86                  word_entry,
 87                  raw_tags,
 88                  is_canonical=is_first_bold,
 89              )
 90              if node.kind == NodeKind.BOLD:
 91                  is_first_bold = False
 92              raw_tags.clear()
 93      clean_node(wxr, word_entry, expanded_nodes)
 94      if len(raw_tags) > 0:
 95          word_entry.raw_tags.extend(raw_tags)
 96          translate_raw_tags(word_entry)
 97
 98
 99  def add_form_data(
100      node: WikiNode,
101      forms_text: str,
102      extracted_forms: dict[str, Form],
103      word_entry: WordEntry,
104      raw_tags: list[str],
105      is_canonical: bool = False,
106  ) -> None:
107      for form_text in re.split(r"・|、|,|•", forms_text):
108          form_text = form_text.strip()
109          if form_text in extracted_forms:
110              form = extracted_forms[form_text]
111              for raw_tag in raw_tags:
112                  if raw_tag not in form.raw_tags:    112 ↛ 111: line 112 didn't jump to line 111 because the condition on line 112 was always true
113                      form.raw_tags.append(raw_tag)
114                      translate_raw_tags(form)
115              continue
116          elif (
117              form_text == word_entry.word
118              or form_text.replace(" ", "") == word_entry.word
119              or len(form_text) == 0
120          ):
121              continue
122          form = Form(
123              form=form_text, raw_tags=raw_tags if raw_tags != ["又は"] else []
124          )
125          extracted_forms[form_text] = form
126          if (
127              node.kind == NodeKind.BOLD
128              or (isinstance(node, HTMLNode) and node.tag == "strong")
129          ) and is_canonical:
130              form.tags.append("canonical")
131              is_canonical = False
132          if isinstance(node, HTMLNode):
133              class_names = node.attrs.get("class", "")
134              for class_name in FORM_OF_CLASS_TAGS:
135                  if class_name in class_names:
136                      form.tags.append(class_name)
137              class_name = node.attrs.get("class", "")
138              if "tr Latn" in class_name or "headword-tr" in class_name:
139                  form.tags.append("transliteration")
140          translate_raw_tags(form)
141          if raw_tags == ["又は"] and len(word_entry.forms) > 0:
142              form.tags = word_entry.forms[-1].tags
143              form.raw_tags = word_entry.forms[-1].raw_tags
144          word_entry.forms.append(form)
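
For context on the splitting done at line 107, a minimal standalone sketch; the input string and variable names below are made-up examples for illustration, not taken from header.py or from this report:

    import re

    # Hypothetical headword text; add_form_data splits such strings on the
    # separators ・ 、 , and • and strips whitespace around each piece.
    forms_text = "ねこ・ネコ, 猫"
    forms = [
        part.strip()
        for part in re.split(r"・|、|,|•", forms_text)
        if part.strip() != ""
    ]
    print(forms)  # ['ねこ', 'ネコ', '猫']

Each surviving piece then becomes a Form entry unless it is empty, already extracted, or identical to the page title (lines 109-121).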