Coverage for src/wiktextract/extractor/ja/linkage.py: 96%

80 statements  

« prev     ^ index     » next       coverage.py v7.6.10, created at 2024-12-27 08:07 +0000

1from mediawiki_langcodes import code_to_name, name_to_code 

2from wikitextprocessor.parser import LevelNode, NodeKind, TemplateNode, WikiNode 

3 

4from ...page import clean_node 

5from ...wxr_context import WiktextractContext 

6from ..ruby import extract_ruby 

7from .models import Descendant, Linkage, WordEntry 

8from .section_titles import LINKAGES 

9from .tags import translate_raw_tags 

10 

11 

def extract_linkage_section(
    wxr: WiktextractContext,
    word_entry: WordEntry,
    level_node: LevelNode,
    linkage_type: str,
) -> None:
    """Collect the linkage entries found under a linkage section.

    "cognates" and "descendants" sections are handed to
    `extract_descendant_section` unchanged.  For any other section the
    direct list and template children of `level_node` are walked in order:
    a template whose name starts with "rel-top" sets the sense passed along
    with the following lists, and every list item (at any nesting depth) is
    processed by `process_linkage_list_item`.

    Args:
        wxr: Extraction context, passed through to the helpers.
        word_entry: Entry whose linkage lists receive the results.
        level_node: Section node to scan.
        linkage_type: Initial name of the `word_entry` attribute to fill.
    """
    if linkage_type in ("cognates", "descendants"):
        extract_descendant_section(wxr, word_entry, level_node, linkage_type)
        return

    current_sense = ""
    # A list item may switch the linkage type; whatever it returns is carried
    # over to every list item processed after it.
    current_type = linkage_type
    for child in level_node.find_child(NodeKind.LIST | NodeKind.TEMPLATE):
        if isinstance(child, TemplateNode) and child.template_name.startswith(
            "rel-top"
        ):
            first_arg = child.template_parameters.get(1, "")
            current_sense = clean_node(wxr, None, first_arg)
            continue
        if child.kind != NodeKind.LIST:
            continue
        for item in child.find_child_recursively(NodeKind.LIST_ITEM):
            current_type = process_linkage_list_item(
                wxr, word_entry, item, current_type, current_sense
            )

33 

34 

def process_linkage_list_item(
    wxr: WiktextractContext,
    word_entry: WordEntry,
    list_item: WikiNode,
    linkage_type: str,
    sense: str,
) -> str:
    """Extract linkage words from a single list item.

    The children of `list_item` are visited in order:

    * the first string child containing ":" marks a label; the cleaned text
      of everything before that child is looked up in `LINKAGES` and, when
      found, replaces the current linkage type;
    * templates whose name starts with "おくりがな", "ふりがな" or "xlink"
      are expanded and yield a word (ruby is discarded for "xlink");
    * link nodes yield a word from their cleaned text;
    * a "sense" template replaces the sense used for the following words.

    Args:
        wxr: Extraction context.
        word_entry: Entry whose attribute named by the linkage type is
            appended to.
        list_item: List item node to scan.
        linkage_type: Linkage type in effect when the item starts.
        sense: Sense in effect when the item starts.

    Returns:
        The linkage type in effect after the item has been processed.
    """
    word_template_prefixes = ("おくりがな", "ふりがな", "xlink")
    label_seen = False
    for position, child in enumerate(list_item.children):
        if isinstance(child, str):
            # Only the first colon-bearing string is treated as a label
            # separator; all other strings are ignored.
            if ":" in child and not label_seen:
                label = clean_node(wxr, None, list_item.children[:position])
                linkage_type = LINKAGES.get(label, linkage_type)
                label_seen = True
            continue

        is_template = isinstance(child, TemplateNode)
        if is_template and child.template_name.startswith(
            word_template_prefixes
        ):
            wikitext = wxr.wtp.node_to_wikitext(child)
            expanded = wxr.wtp.parse(wikitext, expand_all=True)
            ruby, plain_nodes = extract_ruby(wxr, expanded.children)
            if child.template_name == "xlink":
                ruby.clear()
            word = clean_node(wxr, None, plain_nodes)
            if len(word) > 0:
                target = getattr(word_entry, linkage_type)
                target.append(Linkage(word=word, ruby=ruby, sense=sense))
        elif isinstance(child, WikiNode) and child.kind == NodeKind.LINK:
            word = clean_node(wxr, None, child)
            if len(word) > 0:
                target = getattr(word_entry, linkage_type)
                target.append(Linkage(word=word, sense=sense))
        elif is_template and child.template_name == "sense":
            sense = clean_node(wxr, None, child).strip("(): ")

    return linkage_type

74 

75 

def extract_descendant_section(
    wxr: WiktextractContext,
    word_entry: WordEntry,
    level_node: LevelNode,
    linkage_type: str,
) -> None:
    """Collect descendant entries from the lists directly under a section.

    Each list item of each direct child list of `level_node` is processed by
    `process_desc_list_item` with an empty parent list, and the returned
    entries are appended to the `word_entry` attribute named by
    `linkage_type`.

    Args:
        wxr: Extraction context, passed through to the helper.
        word_entry: Entry that receives the collected data.
        level_node: Section node to scan.
        linkage_type: Name of the `word_entry` list attribute to extend.
    """
    collected = [
        descendant
        for list_node in level_node.find_child(NodeKind.LIST)
        for item in list_node.find_child(NodeKind.LIST_ITEM)
        for descendant in process_desc_list_item(wxr, item, [])
    ]
    getattr(word_entry, linkage_type).extend(collected)

87 

88 

def process_desc_list_item(
    wxr: WiktextractContext, list_item: WikiNode, parent_list: list[Descendant]
) -> list[Descendant]:
    """Build `Descendant` entries from one list item and its nested lists.

    A string child containing ":" sets the language: the cleaned text of the
    children before it becomes the language name, which is converted to a
    code with `name_to_code`.  Each "l" template produces one entry; if no
    language code or name is known yet, it is taken from the template's first
    argument (and `code_to_name`).  Nested lists are processed recursively
    with the entries collected so far as their parents.

    Args:
        wxr: Extraction context.
        list_item: List item node to scan.
        parent_list: Entries whose `descendants` are extended with the
            entries created from this item.

    Returns:
        The entries created directly from this list item.  Entries from
        nested lists are not part of this list; they are attached to these
        entries through `descendants`.
    """
    # https://ja.wiktionary.org/wiki/テンプレート:l
    # (template argument, Descendant field) pairs, applied in this order so
    # that a later non-empty argument overwrites an earlier one targeting
    # the same field.
    arg_fields = (
        (2, "word"),
        (3, "word"),
        (4, "sense"),
        ("gloss", "sense"),
        ("t", "sense"),
        ("tr", "roman"),
    )
    found = []
    language = ""
    code = ""
    for position, node in enumerate(list_item.children):
        if isinstance(node, str):
            if ":" in node:
                language = clean_node(
                    wxr, None, list_item.children[:position]
                )
                code = name_to_code(language, "ja")
            continue

        if isinstance(node, TemplateNode) and node.template_name == "l":
            if code == "":
                code = clean_node(
                    wxr, None, node.template_parameters.get(1, "")
                )
            if language == "":
                language = code_to_name(code, "ja")

            entry = Descendant(lang=language, lang_code=code)
            for arg, attr in arg_fields:
                value = clean_node(
                    wxr, None, node.template_parameters.get(arg, "")
                )
                if value != "":
                    setattr(entry, attr, value)

            # Gender markers only exist in the expanded template output.
            wikitext = wxr.wtp.node_to_wikitext(node)
            expanded = wxr.wtp.parse(wikitext, expand_all=True)
            gender_spans = expanded.find_html(
                "span", attr_name="class", attr_value="gender"
            )
            for span in gender_spans:
                gender = clean_node(wxr, None, span)
                if gender != "":
                    entry.raw_tags.append(gender)

            # Entries without a word are dropped.
            if entry.word != "":
                translate_raw_tags(entry)
                found.append(entry)
        elif isinstance(node, WikiNode) and node.kind == NodeKind.LIST:
            for nested_item in node.find_child(NodeKind.LIST_ITEM):
                # The return value is intentionally unused: the recursive
                # call attaches its entries to `found` via `descendants`.
                process_desc_list_item(wxr, nested_item, found)

    for parent in parent_list:
        parent.descendants.extend(found)
    return found