Coverage for src/wiktextract/extractor/ja/pos.py: 82%

99 statements  

« prev     ^ index     » next       coverage.py v7.9.0, created at 2025-06-13 07:43 +0000

1from wikitextprocessor.parser import ( 

2 LEVEL_KIND_FLAGS, 

3 LevelNode, 

4 NodeKind, 

5 TemplateNode, 

6 WikiNode, 

7) 

8 

9from ...page import clean_node 

10from ...wxr_context import WiktextractContext 

11from ..ruby import extract_ruby 

12from .example import extract_example_list_item 

13from .header import extract_header_nodes 

14from .models import AltForm, Sense, WordEntry 

15from .section_titles import POS_DATA 

16from .tags import translate_raw_tags 

17 

18 

def parse_pos_section(
    wxr: WiktextractContext,
    page_data: list[WordEntry],
    base_data: WordEntry,
    level_node: LevelNode,
    pos_title: str,
) -> None:
    """Parse one part-of-speech section into a new entry of ``page_data``.

    A deep copy of ``base_data`` is appended to ``page_data`` and filled with
    the POS information looked up in ``POS_DATA``, the senses found in the
    section's ``#`` lists, the header nodes that precede the first such list,
    and whatever ``extract_conjugation_section`` adds.  The new entry is
    removed again when the section contains no gloss list and the conjugation
    extraction added no forms.

    Args:
        wxr: Extraction context passed through to the helper extractors.
        page_data: Entries collected so far; mutated in place.
        base_data: Template entry that is deep-copied for this section.
        level_node: The section node to parse.
        pos_title: Section title; must be a key of ``POS_DATA``.

    Raises:
        KeyError: If ``pos_title`` is not a key of ``POS_DATA``.
    """
    # Kept as a function-level import exactly like the original code
    # (presumably to avoid a circular import -- TODO confirm).
    from .conjugation import extract_conjugation_section

    entry = base_data.model_copy(deep=True)
    page_data.append(entry)
    entry.pos_title = pos_title
    pos_data = POS_DATA[pos_title]
    entry.pos = pos_data["pos"]
    entry.tags.extend(pos_data.get("tags", []))

    # ``None`` means "no gloss list seen yet".  Index 0 is a valid child
    # position, so it must not double as the "not found" sentinel.
    gloss_list_start = None
    for list_index, list_node in level_node.find_child(NodeKind.LIST, True):
        if not list_node.sarg.endswith("#"):  # linkage list
            continue
        if gloss_list_start is None:
            gloss_list_start = list_index
        for list_item in list_node.find_child(NodeKind.LIST_ITEM):
            process_gloss_list_item(wxr, entry, list_item)

    # Header nodes are the children before the first gloss list; without a
    # gloss list the slice is empty, as it was in the original code.
    header_end = 0 if gloss_list_start is None else gloss_list_start
    extract_header_nodes(wxr, entry, level_node.children[:header_end])

    # Measured after header extraction so only forms contributed by the
    # conjugation section count towards keeping the entry.
    old_forms_len = len(entry.forms)
    extract_conjugation_section(wxr, entry, level_node)
    if gloss_list_start is None and len(entry.forms) == old_forms_len:
        page_data.pop()

49 

50 

def process_gloss_list_item(
    wxr: WiktextractContext,
    word_entry: WordEntry,
    list_item_node: WikiNode,
    parent_gloss: str = "",
) -> None:
    """Build a ``Sense`` from one gloss list item and recurse into sub-lists.

    Children of the list item other than nested lists are inspected one by
    one: ``context``/``タグ`` and ``lb`` templates become raw tags,
    ``wikipedia-s`` contributes its expanded text without a trailing
    "⁽ʷᵖ⁾", ``wp`` is dropped, and everything else is kept as gloss
    content.  The sense is appended to ``word_entry.senses`` only when the
    resulting gloss text is non-empty.

    Nested lists ending in ``*`` or ``:`` are handed to
    ``extract_example_list_item``; nested ``#`` lists are processed
    recursively with this item's gloss text as ``parent_gloss``.
    """
    # Everything in the item except nested lists (empty strings included).
    item_children = list(
        list_item_node.invert_find_child(NodeKind.LIST, include_empty_str=True)
    )
    sense = Sense()
    find_form_of_data(wxr, word_entry, sense, list_item_node)
    if parent_gloss != "":
        sense.glosses.append(parent_gloss)

    gloss_parts = []
    for child in item_children:
        if not isinstance(child, TemplateNode):
            gloss_parts.append(child)
            continue

        name = child.template_name
        if name in ("context", "タグ"):
            # https://ja.wiktionary.org/wiki/テンプレート:context
            # https://ja.wiktionary.org/wiki/テンプレート:タグ
            tag_text = clean_node(wxr, sense, child).strip("()")
            pieces = (piece.strip() for piece in tag_text.split(","))
            sense.raw_tags.extend(piece for piece in pieces if piece != "")
        elif name == "wikipedia-s":
            link_text = clean_node(wxr, None, child)
            gloss_parts.append(link_text.removesuffix("⁽ʷᵖ⁾").strip())
        elif name == "wp":
            continue
        elif name == "lb":
            extract_lb_template(wxr, sense, child)
        else:
            gloss_parts.append(child)

    # Round-trip the kept nodes through wikitext so templates get expanded
    # before ruby is separated from the plain gloss content.
    gloss_wikitext = wxr.wtp.node_to_wikitext(gloss_parts)
    expanded_root = wxr.wtp.parse(gloss_wikitext, expand_all=True)
    ruby, without_ruby = extract_ruby(wxr, expanded_root.children)
    gloss_text = clean_node(wxr, sense, without_ruby)
    sense.ruby = ruby
    if gloss_text != "":
        sense.glosses.append(gloss_text)
        translate_raw_tags(sense)
        word_entry.senses.append(sense)

    for sub_list in list_item_node.find_child(NodeKind.LIST):
        marker = sub_list.sarg
        if marker.endswith(("*", ":")):
            for example_item in sub_list.find_child(NodeKind.LIST_ITEM):
                extract_example_list_item(
                    wxr, word_entry, sense, example_item
                )
        elif marker.endswith("#"):
            for child_item in sub_list.find_child(NodeKind.LIST_ITEM):
                process_gloss_list_item(
                    wxr, word_entry, child_item, gloss_text
                )

117 

118 

def find_form_of_data(
    wxr: WiktextractContext,
    word_entry: WordEntry,
    sense: Sense,
    list_item_node: WikiNode,
) -> None:
    """Record in ``sense.form_of`` the word that a gloss item refers to.

    For every direct child template whose name ends in " of", the template
    is expanded and the first link with non-empty text is stored.  If that
    yields nothing and ``word_entry`` carries the "form-of" tag, the first
    non-empty direct child link of the list item is used instead.
    """

    def add_first_link(link_nodes) -> None:
        # Store the first link whose cleaned text is non-empty, then stop.
        for link in link_nodes:
            target = clean_node(wxr, None, link)
            if target != "":
                sense.form_of.append(AltForm(word=target))
                return

    for template in list_item_node.find_child(NodeKind.TEMPLATE):
        if not template.template_name.endswith(" of"):
            continue
        expanded = wxr.wtp.parse(
            wxr.wtp.node_to_wikitext(template), expand_all=True
        )
        add_first_link(expanded.find_child_recursively(NodeKind.LINK))

    if not sense.form_of and "form-of" in word_entry.tags:
        add_first_link(list_item_node.find_child(NodeKind.LINK))

143 

144 

def extract_note_section(
    wxr: WiktextractContext, word_entry: WordEntry, level_node: LevelNode
) -> None:
    """Collect the notes of a section into ``word_entry.notes``.

    Every list item found anywhere below ``level_node`` yields one note
    (empty texts are skipped).  When the section has no list item at all,
    its content outside of nested sections is cleaned as a single note.
    """
    found_list_item = False
    for item in level_node.find_child_recursively(NodeKind.LIST_ITEM):
        found_list_item = True
        item_text = clean_node(wxr, word_entry, item.children)
        if item_text:
            word_entry.notes.append(item_text)
    if found_list_item:
        return

    # No list in the section: fall back to the nodes that are not
    # sub-section (level) nodes.
    body_nodes = list(level_node.invert_find_child(LEVEL_KIND_FLAGS))
    body_text = clean_node(wxr, word_entry, body_nodes)
    if body_text:
        word_entry.notes.append(body_text)

162 

163 

def extract_lb_template(
    wxr: WiktextractContext, sense: Sense, t_node: TemplateNode
) -> None:
    """Append the labels of an ``lb`` template to ``sense.raw_tags``.

    The cleaned template text has surrounding parentheses and spaces
    stripped and is split on commas; empty pieces are ignored.
    """
    expanded = clean_node(wxr, sense, t_node)
    labels = (part.strip() for part in expanded.strip("() ").split(","))
    sense.raw_tags.extend(label for label in labels if label != "")