Coverage for src/wiktextract/extractor/ja/header.py: 100%

49 statements  

« prev     ^ index     » next       coverage.py v7.6.4, created at 2024-10-25 10:11 +0000

1import re 

2 

3from wikitextprocessor.parser import HTMLNode, NodeKind, TemplateNode, WikiNode 

4 

5from ...page import clean_node 

6from ...wxr_context import WiktextractContext 

7from .models import Form, WordEntry 

8from .tags import translate_raw_tags 

9 

# Names looked for inside an HTML node's "class" attribute by
# `add_form_data()`; each one found is appended as-is to the form's tags.
FORM_OF_CLASS_TAGS = frozenset(["kanji", "plural"])

11 

12 

def extract_header_nodes(
    wxr: WiktextractContext, word_entry: WordEntry, nodes: list[WikiNode]
) -> None:
    """Collect word forms from the headword line nodes into `word_entry`.

    The nodes, minus the "jachar"/"kochar"/"vichar"/"zhchar" templates, are
    converted back to wikitext, re-parsed with all templates expanded and
    then scanned twice:

    1. HTML, bold and italic nodes are visited in document order.
       ``<small>`` text is buffered as raw tags for the next form node; any
       other accepted node is handed to `add_form_data()`.
    2. The clean text of the whole expanded tree is searched for bracketed
       spans, which are also handed to `add_form_data()` without raw tags.

    Forms are de-duplicated across both passes through one shared set.
    """
    char_templates = ("jachar", "kochar", "vichar", "zhchar")
    seen_forms = set()
    kept_nodes = []
    # True until a bold node has been processed, or from the start when
    # one of the character templates is present.
    canonical_pending = True
    for item in nodes:
        if (
            isinstance(item, TemplateNode)
            and item.template_name in char_templates
        ):
            canonical_pending = False
            continue
        kept_nodes.append(item)

    wikitext = wxr.wtp.node_to_wikitext(kept_nodes)
    root = wxr.wtp.parse(wikitext, expand_all=True)

    pending_tags = []
    wanted_kinds = NodeKind.HTML | NodeKind.BOLD | NodeKind.ITALIC
    for child in root.find_child_recursively(wanted_kinds):
        if isinstance(child, HTMLNode):
            css_classes = child.attrs.get("class", "")
            if (
                child.tag not in ["strong", "small"]
                and "headword" not in css_classes
                and "form-of" not in css_classes
            ):
                # not an HTML element that carries a form or a tag
                continue
            if child.tag == "small":
                label = clean_node(wxr, None, child).strip("(): ")
                # ignore "又は"(or) in "ja-noun" template
                if label != "又は" and label not in pending_tags:
                    pending_tags.append(label)
                continue

        add_form_data(
            child,
            clean_node(wxr, None, child).strip("()【】 "),
            seen_forms,
            word_entry,
            pending_tags,
            is_canonical=canonical_pending,
        )
        # NOTE(review): only a wikitext bold node resets the flag, an HTML
        # <strong> node does not - confirm this is intended.
        if child.kind == NodeKind.BOLD:
            canonical_pending = False
        # buffered tags belong to the form node just handled
        pending_tags.clear()

    plain_text = clean_node(wxr, word_entry, root)
    for bracketed in re.findall(r"[(【][^()【】]+[)】]", plain_text):
        add_form_data(
            root,
            bracketed.strip("()【】 "),
            seen_forms,
            word_entry,
            [],
        )

69 

70 

def add_form_data(
    node: WikiNode,
    forms_text: str,
    extracted_forms: set[str],
    word_entry: WordEntry,
    raw_tags: list[str],
    is_canonical: bool = False,
) -> None:
    """Split `forms_text` into forms and append the new ones to `word_entry`.

    Args:
        node: Node the text came from. Its kind, and for HTML nodes its tag
            and "class" attribute, decide which tags a form receives.
        forms_text: One or more forms separated by "・", "、" or ",".
        extracted_forms: Forms already emitted; updated in place and used to
            skip duplicates.
        word_entry: Entry whose `forms` list receives the new `Form` objects.
            Forms equal to `word_entry.word` (also after removing spaces)
            are skipped.
        raw_tags: Raw tags attached to every form created by this call.
        is_canonical: When true and `node` is a bold node or an HTML
            ``<strong>`` node, the first form created is tagged "canonical".
    """
    is_bold = node.kind == NodeKind.BOLD or (
        isinstance(node, HTMLNode) and node.tag == "strong"
    )
    class_names = (
        node.attrs.get("class", "") if isinstance(node, HTMLNode) else ""
    )
    for form_text in re.split(r"・|、|,", forms_text):
        form_text = form_text.strip()
        if (
            not form_text
            or form_text in extracted_forms
            or form_text == word_entry.word
            or form_text.replace(" ", "") == word_entry.word
        ):
            continue
        extracted_forms.add(form_text)
        # Give each form its own list: several forms can be created in one
        # call, and the caller may reuse or clear `raw_tags` afterwards.
        form = Form(form=form_text, raw_tags=list(raw_tags))
        if is_bold and is_canonical:
            form.tags.append("canonical")
            # only the first form of this call is canonical
            is_canonical = False
        # Iterate in sorted order: frozenset iteration order for strings
        # varies between interpreter runs, sorting keeps the output stable.
        form.tags.extend(
            class_name
            for class_name in sorted(FORM_OF_CLASS_TAGS)
            if class_name in class_names
        )
        translate_raw_tags(form)
        word_entry.forms.append(form)