Coverage for src / wiktextract / extractor / ja / pos.py: 83%

103 statements  

« prev     ^ index     » next       coverage.py v7.13.5, created at 2026-03-26 08:59 +0000

1from wikitextprocessor.parser import ( 

2 LEVEL_KIND_FLAGS, 

3 LevelNode, 

4 NodeKind, 

5 TemplateNode, 

6 WikiNode, 

7) 

8 

9from ...page import clean_node 

10from ...wxr_context import WiktextractContext 

11from ..ruby import extract_ruby 

12from .example import extract_example_list_item 

13from .header import extract_header_nodes 

14from .models import AltForm, Sense, WordEntry 

15from .section_titles import POS_DATA 

16from .tags import translate_raw_tags 

17 

18 

def parse_pos_section(
    wxr: WiktextractContext,
    page_data: list[WordEntry],
    base_data: WordEntry,
    level_node: LevelNode,
    pos_title: str,
) -> None:
    """Parse a part-of-speech section into a new entry appended to page_data.

    A deep copy of ``base_data`` is appended to ``page_data`` and filled in
    this order: POS name and tags looked up in ``POS_DATA[pos_title]``,
    header data from the child nodes located before the first gloss list,
    senses from every gloss list, and whatever
    ``extract_conjugation_section`` adds.  The new entry is removed again
    when the section contains no gloss list and the conjugation step did not
    change the number of forms.

    ``pos_title`` is used directly as a subscript of ``POS_DATA``, so it
    must be a key of that table.
    """
    # Function-scope import, kept exactly where the original code had it
    # (presumably to avoid a circular import -- TODO confirm).
    from .conjugation import extract_conjugation_section

    page_data.append(base_data.model_copy(deep=True))
    word_entry = page_data[-1]
    word_entry.pos_title = pos_title
    pos_data = POS_DATA[pos_title]
    word_entry.pos = pos_data["pos"]
    word_entry.tags.extend(pos_data.get("tags", []))

    # Lists whose marker does not end with "#" are linkage lists, not
    # glosses.  The gloss lists are gathered once, together with their child
    # index, so that "no gloss list" is detected from the list being empty
    # instead of from a start index of 0, which is also a valid position.
    gloss_lists = [
        (child_index, list_node)
        for child_index, list_node in level_node.find_child(
            NodeKind.LIST, True
        )
        if list_node.sarg.endswith("#")
    ]
    # Without any gloss list the header slice stays empty, as before.
    header_end = gloss_lists[0][0] if gloss_lists else 0

    # The header is handled before the glosses: gloss processing reads
    # word_entry.tags (see find_form_of_data).
    extract_header_nodes(wxr, word_entry, level_node.children[:header_end])
    for _, gloss_list in gloss_lists:
        for list_item in gloss_list.find_child(NodeKind.LIST_ITEM):
            process_gloss_list_item(wxr, word_entry, list_item)

    forms_before = len(word_entry.forms)
    extract_conjugation_section(wxr, word_entry, level_node)
    if not gloss_lists and len(word_entry.forms) == forms_before:
        # Neither glosses nor new forms: drop the entry again.
        page_data.pop()

52 

53 

def process_gloss_list_item(
    wxr: WiktextractContext,
    word_entry: WordEntry,
    list_item_node: WikiNode,
    parent_gloss: str = "",
) -> None:
    """Turn one gloss list item into a ``Sense`` of ``word_entry``.

    Child nodes other than nested lists form the gloss.  Label templates
    contribute raw tags instead of gloss text, and ``wp`` templates are
    dropped.  The sense is appended to ``word_entry.senses`` only when the
    cleaned gloss text is not empty.  Nested ``*``/``:`` lists are passed to
    ``extract_example_list_item``; nested ``#`` lists are processed
    recursively with this item's gloss text as ``parent_gloss``.
    """
    non_list_children = list(
        list_item_node.invert_find_child(NodeKind.LIST, include_empty_str=True)
    )
    sense_data = Sense()
    find_form_of_data(wxr, word_entry, sense_data, list_item_node)
    if parent_gloss:
        sense_data.glosses.append(parent_gloss)

    gloss_parts = []
    for child in non_list_children:
        if not isinstance(child, TemplateNode):
            gloss_parts.append(child)
            continue

        template_name = child.template_name
        if template_name in ("context", "タグ"):
            # https://ja.wiktionary.org/wiki/テンプレート:context
            # https://ja.wiktionary.org/wiki/テンプレート:タグ
            label_text = clean_node(wxr, sense_data, child).strip("()")
            for label in label_text.split(","):
                label = label.strip()
                if label:
                    sense_data.raw_tags.append(label)
        elif template_name == "wikipedia-s":
            # Keep the expanded text without its trailing marker.
            expanded_text = clean_node(wxr, None, child)
            gloss_parts.append(expanded_text.removesuffix("⁽ʷᵖ⁾").strip())
        elif template_name == "wp":
            continue
        elif template_name == "lb":
            extract_lb_template(wxr, sense_data, child)
        else:
            gloss_parts.append(child)

    gloss_wikitext = wxr.wtp.node_to_wikitext(gloss_parts)
    expanded_gloss = wxr.wtp.parse(gloss_wikitext, expand_all=True)
    ruby, no_ruby = extract_ruby(wxr, expanded_gloss.children)
    gloss_text = clean_node(wxr, sense_data, no_ruby)
    sense_data.ruby = ruby
    if gloss_text:
        sense_data.glosses.append(gloss_text)
        translate_raw_tags(sense_data)
        word_entry.senses.append(sense_data)

    for nested_list in list_item_node.find_child(NodeKind.LIST):
        marker = nested_list.sarg
        if marker.endswith(("*", ":")):
            for example_item in nested_list.find_child(NodeKind.LIST_ITEM):
                extract_example_list_item(
                    wxr, word_entry, sense_data, example_item
                )
        elif marker.endswith("#"):
            for child_item in nested_list.find_child(NodeKind.LIST_ITEM):
                process_gloss_list_item(
                    wxr, word_entry, child_item, gloss_text
                )

120 

121 

def find_form_of_data(
    wxr: WiktextractContext,
    word_entry: WordEntry,
    sense: Sense,
    list_item_node: WikiNode,
) -> None:
    """Fill ``sense.form_of`` from the links of a gloss list item.

    Each direct child template whose name ends with " of" is expanded, and
    the first link in the expansion with non-empty text is recorded.  If
    that yields nothing and ``word_entry`` carries the "form-of" tag, the
    first direct child link with non-empty text is recorded instead and the
    "form-of" tag is added to the sense.
    """
    for template in list_item_node.find_child(NodeKind.TEMPLATE):
        if not template.template_name.endswith(" of"):
            continue
        expanded = wxr.wtp.parse(
            wxr.wtp.node_to_wikitext(template), expand_all=True
        )
        for link in expanded.find_child_recursively(NodeKind.LINK):
            target = clean_node(wxr, None, link)
            if target == "":
                continue
            sense.form_of.append(AltForm(word=target))
            break  # only the first usable link of this template

    if "form-of" not in word_entry.tags or len(sense.form_of) > 0:
        return

    # Fallback for entries already tagged "form-of": plain links in the item.
    for link in list_item_node.find_child(NodeKind.LINK):
        target = clean_node(wxr, None, link)
        if target == "":
            continue
        sense.form_of.append(AltForm(word=target))
        sense.tags.append("form-of")
        break

147 

148 

def extract_note_section(
    wxr: WiktextractContext, word_entry: WordEntry, level_node: LevelNode
) -> None:
    """Append the notes found in ``level_node`` to ``word_entry.notes``.

    Every list item anywhere below the section becomes its own note.  When
    the section has no list item at all, the children that are not matched
    by ``LEVEL_KIND_FLAGS`` are cleaned together into a single note.  Empty
    note text is never stored.
    """
    found_list_item = False
    for item in level_node.find_child_recursively(NodeKind.LIST_ITEM):
        found_list_item = True
        item_text = clean_node(wxr, word_entry, item.children)
        if item_text:
            word_entry.notes.append(item_text)
    if found_list_item:
        return

    section_nodes = list(
        level_node.invert_find_child(LEVEL_KIND_FLAGS, include_empty_str=True)
    )
    section_text = clean_node(wxr, word_entry, section_nodes)
    if section_text:
        word_entry.notes.append(section_text)

170 

171 

def extract_lb_template(
    wxr: WiktextractContext, sense: Sense, t_node: TemplateNode
) -> None:
    """Add the labels of an ``lb`` template to ``sense.raw_tags``.

    The cleaned template text has surrounding parentheses and spaces
    stripped and is split on commas; each non-empty, whitespace-trimmed
    piece is appended as a raw tag.
    """
    label_text = clean_node(wxr, sense, t_node).strip("() ")
    trimmed_labels = (label.strip() for label in label_text.split(","))
    sense.raw_tags.extend(label for label in trimmed_labels if label != "")