Coverage for src/wiktextract/extractor/ja/header.py: 97%

80 statements  

« prev     ^ index     » next       coverage.py v7.10.6, created at 2025-09-12 08:27 +0000

1import re 

2 

3from wikitextprocessor.parser import HTMLNode, NodeKind, TemplateNode, WikiNode 

4 

5from ...page import clean_node 

6from ...wxr_context import WiktextractContext 

7from .models import Form, WordEntry 

8from .tags import translate_raw_tags 

9 

# HTML class names that add_form_data() copies verbatim into a form's tags
# when they appear in the class attribute of the node holding the form.
FORM_OF_CLASS_TAGS = frozenset({"kanji", "plural"})

11 

12 

def extract_header_nodes(
    wxr: WiktextractContext, word_entry: WordEntry, nodes: list[WikiNode]
) -> None:
    """Collect forms and raw tags from the headword-line nodes of an entry.

    The nodes are filtered, converted back to wikitext, re-parsed with all
    templates expanded, and then every HTML, bold and italic node of the
    expanded tree is inspected:

    * HTML nodes with the ``gender`` class contribute their text, split on
      whitespace and commas, to ``word_entry.raw_tags``.
    * ``<small>``/``<i>`` nodes contribute a pending raw tag for the next
      form.
    * ``<span>`` nodes whose class contains ``form-of`` contribute their
      string children as pending raw tags and their bold children as forms.
    * Every other accepted node is treated as form text.

    Forms are recorded through ``add_form_data``.  Pending raw tags that are
    still unused at the end are added to ``word_entry.raw_tags`` and
    translated.

    Args:
        wxr: Extraction context; its ``wtp`` parser is used for re-parsing.
        word_entry: Entry that receives the forms and raw tags.
        nodes: Nodes of the headword line.
    """
    seen_forms = {}
    kept_nodes = []
    canonical_pending = True
    for header_node in nodes:
        if not isinstance(header_node, TemplateNode):
            kept_nodes.append(header_node)
        elif header_node.template_name in (
            "jachar",
            "kochar",
            "vichar",
            "zhchar",
        ):
            # Character-box templates are dropped, and once one is seen no
            # form gets marked as canonical.
            canonical_pending = False
        elif header_node.template_name.startswith(word_entry.lang_code + "-"):
            kept_nodes.append(header_node)
        # Any other template (e.g. "wikipedia", "commonscat") is ignored.

    header_wikitext = wxr.wtp.node_to_wikitext(kept_nodes)
    expanded_root = wxr.wtp.parse(header_wikitext, expand_all=True)

    pending_tags = []
    wanted_kinds = NodeKind.HTML | NodeKind.BOLD | NodeKind.ITALIC
    for node in expanded_root.find_child_recursively(wanted_kinds):
        is_html = isinstance(node, HTMLNode)
        css_class = node.attrs.get("class", "") if is_html else ""

        if is_html and "gender" in css_class.split():
            gender_text = clean_node(wxr, None, node)
            for gender_tag in re.split(r"\s|,", gender_text):
                gender_tag = gender_tag.strip()
                if gender_tag != "":
                    word_entry.raw_tags.append(gender_tag)

        if is_html:
            is_relevant = (
                node.tag in ["strong", "small", "i", "b"]
                or "headword" in css_class
                or "form-of" in css_class
            )
            if not is_relevant:
                continue

        if is_html and node.tag in ["small", "i"]:
            label = clean_node(wxr, None, node).strip("(): ")
            # "又は" (or) in the "ja-noun" template is not a tag.
            if label != "又は" and label not in pending_tags:
                pending_tags.append(label)
        elif is_html and node.tag == "span" and "form-of" in css_class:
            for child in node.children:
                if isinstance(child, str):
                    child_text = child.strip()
                    if child_text != "":
                        pending_tags.append(child_text)
                elif (
                    isinstance(child, WikiNode)
                    and child.kind == NodeKind.BOLD
                ):
                    word = clean_node(wxr, None, child)
                    if word != "":
                        add_form_data(
                            node, word, seen_forms, word_entry, pending_tags
                        )
                    # Tags gathered so far applied to this bold child only.
                    pending_tags.clear()
        else:
            form_text = clean_node(wxr, None, node).strip("()【】 ")
            add_form_data(
                node,
                form_text,
                seen_forms,
                word_entry,
                pending_tags,
                is_canonical=canonical_pending,
            )
            if node.kind == NodeKind.BOLD:
                # Only the first bold node may yield the canonical form.
                canonical_pending = False
            pending_tags.clear()

    # NOTE(review): presumably run for its side effects on word_entry
    # (e.g. category collection) - confirm against clean_node().
    clean_node(wxr, word_entry, expanded_root)
    if len(pending_tags) > 0:
        word_entry.raw_tags.extend(pending_tags)
        translate_raw_tags(word_entry)

95 

96 

def add_form_data(
    node: WikiNode,
    forms_text: str,
    extracted_forms: dict[str, Form],
    word_entry: WordEntry,
    raw_tags: list[str],
    is_canonical: bool = False,
) -> None:
    """Split ``forms_text`` into single forms and add them to ``word_entry``.

    The text is split on ``・``, ``、``, ``,`` and ``•``.  For each piece:

    * If the form was already extracted, the new raw tags are merged into
      the existing ``Form`` and re-translated; no second form is created.
    * Empty pieces and pieces equal to ``word_entry.word`` (also after
      removing spaces) are skipped.
    * Otherwise a new ``Form`` is created, registered in
      ``extracted_forms`` and appended to ``word_entry.forms``.

    Args:
        node: Node the text came from; its kind, tag and class attribute
            decide which extra tags the form receives.
        forms_text: Cleaned text that may contain several forms.
        extracted_forms: Forms created so far, keyed by form text.  Updated
            in place.
        word_entry: Entry that receives the new forms.
        raw_tags: Raw tags collected before this form.  Not modified here.
        is_canonical: When true, the first newly created form taken from a
            bold or ``<strong>`` node is tagged ``canonical``.
    """
    # A lone "又は" (or) is not a tag: it means this form is an alternative
    # of the previously added form and inherits that form's tags.
    is_alternative = raw_tags == ["又は"]

    for form_text in re.split(r"・|、|,|•", forms_text):
        form_text = form_text.strip()
        if form_text in extracted_forms:
            form = extracted_forms[form_text]
            for raw_tag in raw_tags:
                if raw_tag not in form.raw_tags:
                    form.raw_tags.append(raw_tag)
            translate_raw_tags(form)
            continue
        if (
            len(form_text) == 0
            or form_text == word_entry.word
            or form_text.replace(" ", "") == word_entry.word
        ):
            continue

        # Pass a copy: the caller clears its raw_tags list right after this
        # call, which must not be able to empty the tags stored on the form.
        form = Form(
            form=form_text, raw_tags=[] if is_alternative else list(raw_tags)
        )
        extracted_forms[form_text] = form

        if is_canonical and (
            node.kind == NodeKind.BOLD
            or (isinstance(node, HTMLNode) and node.tag == "strong")
        ):
            form.tags.append("canonical")
            # Only one form per call may be canonical.
            is_canonical = False

        if isinstance(node, HTMLNode):
            class_names = node.attrs.get("class", "")
            # Sorted so the tag order does not depend on frozenset
            # iteration order, which varies with string hash randomization.
            for class_name in sorted(FORM_OF_CLASS_TAGS):
                if class_name in class_names:
                    form.tags.append(class_name)
            if "tr Latn" in class_names:
                form.tags.append("transliteration")

        translate_raw_tags(form)

        if is_alternative and len(word_entry.forms) > 0:
            previous_form = word_entry.forms[-1]
            # Copy instead of sharing the list objects, so that later
            # changes to one form do not leak into the other.
            form.tags = list(previous_form.tags)
            form.raw_tags = list(previous_form.raw_tags)

        word_entry.forms.append(form)