Coverage for src/wiktextract/extractor/ja/header.py: 98%

77 statements  

« prev     ^ index     » next       coverage.py v7.9.2, created at 2025-07-04 10:58 +0000

1import re 

2 

3from wikitextprocessor.parser import HTMLNode, NodeKind, TemplateNode, WikiNode 

4 

5from ...page import clean_node 

6from ...wxr_context import WiktextractContext 

7from .models import Form, WordEntry 

8from .tags import translate_raw_tags 

9 

# Class-name fragments that, when found in an HTML node's "class" attribute,
# are appended as-is to the tags of the form extracted from that node.
FORM_OF_CLASS_TAGS = frozenset({"kanji", "plural"})

11 

12 

def extract_header_nodes(
    wxr: WiktextractContext, word_entry: WordEntry, nodes: list[WikiNode]
) -> None:
    """Extract forms and raw tags from headword-line nodes into *word_entry*.

    The nodes (minus the ``jachar``/``kochar``/``vichar``/``zhchar``
    templates) are converted back to wikitext, re-parsed with all templates
    expanded, and the resulting HTML, bold and italic nodes are walked:

    * HTML nodes carrying the ``gender`` class contribute raw tags directly
      to ``word_entry.raw_tags``.
    * ``<small>``/``<i>`` nodes contribute *pending* raw tags, which are
      attached to the next form that is extracted.
    * ``form-of`` spans and the remaining bold/italic/headword nodes are
      handed to ``add_form_data`` to create forms.

    Args:
        wxr: Context providing the wikitext parser (``wxr.wtp``).
        word_entry: Entry that is mutated in place.
        nodes: Nodes that make up the header line.
    """
    # Forms created so far, keyed by form text, so that a repeated form only
    # gains extra raw tags instead of being added a second time.
    extracted_forms: dict[str, Form] = {}
    use_nodes = []
    # Passed as ``is_canonical`` to ``add_form_data`` until a bold node has
    # been handled in the generic branch below.
    is_first_bold = True
    for node in nodes:
        if isinstance(node, TemplateNode) and node.template_name in (
            "jachar",
            "kochar",
            "vichar",
            "zhchar",
        ):
            # These templates are dropped from the expansion, and their
            # presence disables the "canonical" flag for every later form.
            is_first_bold = False
        else:
            use_nodes.append(node)
    expanded_nodes = wxr.wtp.parse(
        wxr.wtp.node_to_wikitext(use_nodes), expand_all=True
    )
    # Pending raw tags, waiting for the next extracted form.
    raw_tags: list[str] = []
    for node in expanded_nodes.find_child_recursively(
        NodeKind.HTML | NodeKind.BOLD | NodeKind.ITALIC
    ):
        if (
            isinstance(node, HTMLNode)
            and "gender" in node.attrs.get("class", "").split()
        ):
            # Gender markers belong to the entry itself, not to a form; the
            # text is split on whitespace and commas.
            raw_tag_text = clean_node(wxr, None, node)
            for raw_tag in re.split(r"\s|,", raw_tag_text):
                raw_tag = raw_tag.strip()
                if raw_tag != "":
                    word_entry.raw_tags.append(raw_tag)
        # Only a few HTML tags, or nodes whose class mentions "headword" or
        # "form-of", are of interest; every other HTML node is skipped.
        if isinstance(node, HTMLNode) and not (
            node.tag in ["strong", "small", "i", "b"]
            or "headword" in node.attrs.get("class", "")
            or "form-of" in node.attrs.get("class", "")
        ):
            continue
        if isinstance(node, HTMLNode) and node.tag in ["small", "i"]:
            # Label text, with surrounding "(", ")", ":" and spaces removed,
            # becomes a pending raw tag (deduplicated).
            raw_tag = clean_node(wxr, None, node).strip("(): ")
            if raw_tag != "又は" and raw_tag not in raw_tags:
                # ignore "又は"(or) in "ja-noun" template
                raw_tags.append(raw_tag)
        elif (
            isinstance(node, HTMLNode)
            and node.tag == "span"
            and "form-of" in node.attrs.get("class", "")
        ):
            # Inside a form-of span, plain text children are labels and bold
            # children are the forms those labels describe.
            for span_child in node.children:
                if isinstance(span_child, str) and span_child.strip() != "":
                    raw_tags.append(span_child.strip())
                elif (
                    isinstance(span_child, WikiNode)
                    and span_child.kind == NodeKind.BOLD
                ):
                    word = clean_node(wxr, None, span_child)
                    if word != "":
                        add_form_data(
                            node, word, extracted_forms, word_entry, raw_tags
                        )
                    # Pending tags are dropped after each bold child, even
                    # when its text was empty.
                    raw_tags.clear()
        else:
            # Remaining HTML nodes that passed the filter, plus bold and
            # italic wiki nodes: their text is a form (or a list of forms).
            form_text = clean_node(wxr, None, node).strip("()【】 ")
            add_form_data(
                node,
                form_text,
                extracted_forms,
                word_entry,
                raw_tags,
                is_canonical=is_first_bold,
            )
            if node.kind == NodeKind.BOLD:
                is_first_bold = False
            # NOTE(review): nesting reconstructed from a flattened listing;
            # this clear is assumed to run for every node in this branch, not
            # only for bold ones. Confirm against the original file.
            raw_tags.clear()
    # NOTE(review): presumably called with word_entry for its side effects on
    # the entry (the return value is discarded); confirm in clean_node.
    clean_node(wxr, word_entry, expanded_nodes)
    if len(raw_tags) > 0:
        # Tags that were never followed by a form describe the entry itself.
        word_entry.raw_tags.extend(raw_tags)
        # NOTE(review): assumed to be inside this `if` (indentation was lost
        # in the listing); confirm against the original file.
        translate_raw_tags(word_entry)

91 

92 

def add_form_data(
    node: WikiNode,
    forms_text: str,
    extracted_forms: dict[str, Form],
    word_entry: WordEntry,
    raw_tags: list[str],
    is_canonical: bool = False,
) -> None:
    """Create ``Form`` objects from *forms_text* and add them to *word_entry*.

    *forms_text* may hold several forms separated by "・", "、", "," or "•".
    Each piece is stripped and then:

    * if it was already extracted, only the raw tags it does not yet have
      are added to the existing form;
    * if it is empty or equals the entry word (also when its spaces are
      removed), it is skipped;
    * otherwise a new form is created, tagged, and appended to
      ``word_entry.forms``.

    Args:
        node: Node the text came from; its kind, tag and "class" attribute
            decide which tags the form receives.
        forms_text: Cleaned text containing one or more forms.
        extracted_forms: Forms already created, keyed by form text; updated
            in place.
        word_entry: Entry whose ``forms`` list is extended.
        raw_tags: Pending raw tags to attach. The list is only read here, so
            the caller may clear or reuse it afterwards.
        is_canonical: When true, the first newly created form coming from a
            bold or ``<strong>`` node is tagged "canonical".
    """
    # A lone "又は" (or) marks an alternative to the previous form rather
    # than a real tag.
    is_or_marker = raw_tags == ["又は"]
    for form_text in re.split(r"・|、|,|•", forms_text):
        form_text = form_text.strip()
        if form_text in extracted_forms:
            form = extracted_forms[form_text]
            for raw_tag in raw_tags:
                if raw_tag not in form.raw_tags:
                    form.raw_tags.append(raw_tag)
            translate_raw_tags(form)
            continue
        if (
            form_text == word_entry.word
            or form_text.replace(" ", "") == word_entry.word
            or len(form_text) == 0
        ):
            continue
        # Copy the caller's list: the caller clears it right after this call,
        # and the form must not depend on that shared object.
        form = Form(
            form=form_text, raw_tags=[] if is_or_marker else list(raw_tags)
        )
        extracted_forms[form_text] = form
        if (
            node.kind == NodeKind.BOLD
            or (isinstance(node, HTMLNode) and node.tag == "strong")
        ) and is_canonical:
            form.tags.append("canonical")
            # Only one form per call may be canonical.
            is_canonical = False
        if isinstance(node, HTMLNode):
            class_names = node.attrs.get("class", "")
            # Sorted so the tag order does not depend on the frozenset's
            # hash-seed-dependent iteration order.
            for class_name in sorted(FORM_OF_CLASS_TAGS):
                if class_name in class_names:
                    form.tags.append(class_name)
            if "tr Latn" in class_names:
                form.tags.append("transliteration")
        translate_raw_tags(form)
        if is_or_marker and len(word_entry.forms) > 0:
            # The alternative inherits the previous form's tags. Copies are
            # used so that later changes to one form do not leak into the
            # other through a shared list.
            previous_form = word_entry.forms[-1]
            form.tags = list(previous_form.tags)
            form.raw_tags = list(previous_form.raw_tags)
        word_entry.forms.append(form)