Coverage for src/wiktextract/extractor/it/pos.py: 77%

86 statements  

coverage.py v7.9.2, created at 2025-07-04 10:58 +0000

from wikitextprocessor import LevelNode, NodeKind, TemplateNode, WikiNode

from ...page import clean_node
from ...wxr_context import WiktextractContext
from ..share import calculate_bold_offsets
from .example import extract_example_list_item
from .models import AltForm, Sense, WordEntry
from .section_titles import POS_DATA
from .tag_form_line import extract_tag_form_line_nodes
from .tags import translate_raw_tags

POS_SUBSECTION_TEMPLATES = frozenset(
    [
        # https://it.wiktionary.org/wiki/Categoria:Template_per_i_verbi
        "-participio passato-",
        "-participio presente-",
        "Ausiliare",
        "Deponente",
        "Intransitivo",
        "Medio",
        "Passivo",
        "Reciproco",
        "Riflessivo",
        "riflessivo",
        "Transitivo",
        # https://it.wiktionary.org/wiki/Categoria:Template_vocabolo
        "Attivo",
        "attivo",
        "Inpr",
        "inpr",
        "Riflpr",
    ]
)


def add_new_pos_data(
    wxr: WiktextractContext,
    page_data: list[WordEntry],
    base_data: WordEntry,
    level_node: LevelNode,
    pos_title: str,
) -> None:
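    # Append a new WordEntry (a deep copy of base_data) for this POS section:
    # pos_title, pos and tags are filled from POS_DATA, and link nodes that
    # are direct children of the section go through clean_node() so that
    # category links are captured.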

    page_data.append(base_data.model_copy(deep=True))
    page_data[-1].pos_title = pos_title
    # coverage: partial branch - the condition below was never true in the
    # measured run, so the "Trascrizione" remapping was never exercised.
    if pos_title.startswith("Trascrizione"):
        pos_title = "Trascrizione"
    pos_data = POS_DATA[pos_title]
    page_data[-1].pos = pos_data["pos"]
    page_data[-1].tags.extend(pos_data.get("tags", []))
    for link_node in level_node.find_child(NodeKind.LINK):
        clean_node(wxr, page_data[-1], link_node)


def extract_pos_section(
    wxr: WiktextractContext,
    page_data: list[WordEntry],
    base_data: WordEntry,
    level_node: LevelNode,
    pos_title: str,
) -> None:
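    # Walk the section: "#" gloss lists become senses of the current entry,
    # the nodes collected before each gloss list are parsed as tag/form
    # lines, and POS_SUBSECTION_TEMPLATES templates (e.g. "Transitivo")
    # either tag the entry or, if it already has senses, start a new one.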

    add_new_pos_data(wxr, page_data, base_data, level_node, pos_title)
    last_gloss_list_index = 0
    for index, node in enumerate(level_node.children):
        if (
            isinstance(node, WikiNode)
            and node.kind == NodeKind.LIST
            and node.sarg.startswith("#")
            and node.sarg.endswith("#")
        ):
            for list_item in node.find_child(NodeKind.LIST_ITEM):
                extract_gloss_list_item(wxr, page_data[-1], list_item)
            extract_tag_form_line_nodes(
                wxr,
                page_data[-1],
                level_node.children[last_gloss_list_index:index],
            )
            last_gloss_list_index = index + 1
        elif (
            isinstance(node, TemplateNode)
            and node.template_name in POS_SUBSECTION_TEMPLATES
        ):
            if len(page_data[-1].senses) > 0:
                add_new_pos_data(
                    wxr, page_data, base_data, level_node, pos_title
                )
            raw_tag = clean_node(wxr, page_data[-1], node).strip("= \n")
            page_data[-1].raw_tags.append(raw_tag)
            translate_raw_tags(page_data[-1])


def extract_gloss_list_item(
    wxr: WiktextractContext,
    word_entry: WordEntry,
    list_item: WikiNode,
    parent_sense: Sense | None = None,
) -> None:
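    # Build one Sense from a gloss list item: parenthesized templates become
    # raw tags, a ":" sub-list right after an example is read as its
    # translation, other "*"/":" sub-lists are parsed as examples, and nested
    # "#" lists are processed recursively as child senses based on this one.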

    gloss_nodes = []
    sense = (
        Sense() if parent_sense is None else parent_sense.model_copy(deep=True)
    )
    for node in list_item.children:
        if isinstance(node, TemplateNode):
            t_str = clean_node(wxr, sense, node)
            if t_str.startswith("(") and t_str.endswith(")"):
                sense.raw_tags.append(t_str.strip("()"))
            else:
                gloss_nodes.append(t_str)
        elif isinstance(node, WikiNode) and node.kind == NodeKind.LIST:
            if (
                node.sarg.endswith(":")
                and len(sense.examples) > 0
                and sense.examples[-1].translation == ""
            ):
                for tr_list_item in node.find_child(NodeKind.LIST_ITEM):
                    sense.examples[-1].translation = clean_node(
                        wxr, sense, tr_list_item.children
                    )
                    calculate_bold_offsets(
                        wxr,
                        tr_list_item,
                        sense.examples[-1].translation,
                        sense.examples[-1],
                        "bold_translation_offsets",
                    )
            elif node.sarg.endswith(("*", ":")):
                for example_list_item in node.find_child(NodeKind.LIST_ITEM):
                    extract_example_list_item(
                        wxr, sense, example_list_item, word_entry.lang_code
                    )
        else:
            gloss_nodes.append(node)
    gloss_str = clean_node(wxr, sense, gloss_nodes)
    # coverage: partial branch - the condition below was always true in the
    # measured run, so the empty-gloss path was never exercised.
    if gloss_str != "":
        sense.glosses.append(gloss_str)
        translate_raw_tags(sense)
        if "form-of" in word_entry.tags:
            extract_form_of_word(wxr, sense, list_item)
        word_entry.senses.append(sense)

    for list_node in list_item.find_child(NodeKind.LIST):
        if list_node.sarg.startswith("#") and list_node.sarg.endswith("#"):
            for child_list_item in list_node.find_child(NodeKind.LIST_ITEM):
                extract_gloss_list_item(wxr, word_entry, child_list_item, sense)


def extract_form_of_word(
    wxr: WiktextractContext,
    sense: Sense,
    list_item: WikiNode,
) -> None:
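    # Use the last wiki link in the list item as the "form of" target word.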

    word = ""
    for node in list_item.find_child(NodeKind.LINK):
        word = clean_node(wxr, None, node)
    # coverage: partial branch - the condition below was always true in the
    # measured run, so the function never returned without appending a form.
    if word != "":
        sense.form_of.append(AltForm(word=word))


def extract_note_section(
    wxr: WiktextractContext, page_data: list[WordEntry], level_node: LevelNode
) -> None:
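    # Collect notes from list items (or from the whole section when there is
    # no list) and copy them to every entry of the current language.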

    notes = []
    has_list = False
    for list_node in level_node.find_child(NodeKind.LIST):
        has_list = True
        for list_item in list_node.find_child(NodeKind.LIST_ITEM):
            note = clean_node(wxr, None, list_item.children)
            if note != "":
                notes.append(note)
    if not has_list:
        note = clean_node(wxr, None, level_node.children)
        if note != "":
            notes.append(note)

    for data in page_data:
        if data.lang_code == page_data[-1].lang_code:
            data.notes.extend(notes)
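
For orientation, a minimal sketch of how these entry points are typically driven from a section dispatcher. The handle_section name, the NOTE_TITLES placeholder, and the exact title checks are illustrative assumptions; the real dispatch lives elsewhere in the extractor and may differ.

NOTE_TITLES: frozenset[str] = frozenset()  # placeholder, not part of this module


def handle_section(
    wxr: WiktextractContext,
    page_data: list[WordEntry],
    base_data: WordEntry,
    level_node: LevelNode,
) -> None:
    # Hypothetical caller: clean the heading text and route the section to
    # the appropriate extractor above.
    title = clean_node(wxr, None, level_node.largs)
    if title in POS_DATA or title.startswith("Trascrizione"):
        # POS headings append a new WordEntry and fill its senses and tags.
        extract_pos_section(wxr, page_data, base_data, level_node, title)
    elif title in NOTE_TITLES:
        # Note headings copy notes onto every entry of the current language.
        extract_note_section(wxr, page_data, level_node)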