Coverage for src/wiktextract/extractor/it/pos.py: 77%

1from wikitextprocessor import LevelNode, NodeKind, TemplateNode, WikiNode

3from ...page import clean_node

4from ...wxr_context import WiktextractContext

5from ..share import calculate_bold_offsets

6from .example import extract_example_list_item

7from .models import AltForm, Sense, WordEntry

8from .section_titles import POS_DATA

9from .tag_form_line import extract_tag_form_line_nodes

10from .tags import translate_raw_tags

# Template names that are matched against ``TemplateNode.template_name``
# to recognise a subsection marker inside a part-of-speech section.
POS_SUBSECTION_TEMPLATES = frozenset(
    {
        # https://it.wiktionary.org/wiki/Categoria:Template_per_i_verbi
        "-participio passato-",
        "-participio presente-",
        "Ausiliare",
        "Deponente",
        "Intransitivo",
        "Medio",
        "Passivo",
        "Reciproco",
        "Riflessivo",
        "riflessivo",
        "Transitivo",
        # https://it.wiktionary.org/wiki/Categoria:Template_vocabolo
        "Attivo",
        "attivo",
        "Inpr",
        "inpr",
        "Riflpr",
    }
)

def add_new_pos_data(
    wxr: WiktextractContext,
    page_data: list[WordEntry],
    base_data: WordEntry,
    level_node: LevelNode,
    pos_title: str,
) -> None:
    """Append a new entry for a part-of-speech section to ``page_data``.

    The new entry is a deep copy of ``base_data``.  It stores the section
    title unchanged in ``pos_title`` and takes ``pos`` (plus any ``tags``)
    from the matching ``POS_DATA`` record.

    Args:
        wxr: Extraction context handed through to ``clean_node``.
        page_data: Entries collected so far; the new entry is appended.
        base_data: Template entry that is deep-copied.
        level_node: Section node whose direct LINK children are cleaned.
        pos_title: Section title used as the ``POS_DATA`` key.

    Raises:
        KeyError: If the (normalised) title is not a ``POS_DATA`` key.
    """
    entry = base_data.model_copy(deep=True)
    page_data.append(entry)
    entry.pos_title = pos_title

    # Every title beginning with "Trascrizione" shares one POS_DATA record;
    # the entry itself keeps the original, un-normalised title.
    lookup_title = (
        "Trascrizione" if pos_title.startswith("Trascrizione") else pos_title
    )
    pos_info = POS_DATA[lookup_title]
    entry.pos = pos_info["pos"]
    entry.tags.extend(pos_info.get("tags", []))

    # The cleaned text is discarded; the call is made only for whatever
    # clean_node records on the entry (presumably categories -- TODO confirm).
    for link in level_node.find_child(NodeKind.LINK):
        clean_node(wxr, entry, link)

def extract_pos_section(
    wxr: WiktextractContext,
    page_data: list[WordEntry],
    base_data: WordEntry,
    level_node: LevelNode,
    pos_title: str,
) -> None:
    """Extract glosses, form-line data and subsection tags from a POS section.

    A new entry is always created first.  The direct children of
    ``level_node`` are then scanned in order:

    * A list whose marker starts and ends with ``#`` is a gloss list.  Each
      item goes to ``extract_gloss_list_item``, and the sibling nodes since
      the previous gloss list go to ``extract_tag_form_line_nodes``.
    * A template named in ``POS_SUBSECTION_TEMPLATES`` adds its cleaned text
      as a raw tag.  If the current entry already has senses, a new entry is
      started first and receives the tag.

    Args:
        wxr: Extraction context handed through to the helpers.
        page_data: Entries collected so far; new entries are appended.
        base_data: Template entry copied for every new entry.
        level_node: The part-of-speech section node.
        pos_title: Section title, forwarded to ``add_new_pos_data``.
    """
    add_new_pos_data(wxr, page_data, base_data, level_node, pos_title)

    # Index of the first child after the most recent gloss list.
    pending_start = 0
    for position, child in enumerate(level_node.children):
        is_gloss_list = (
            isinstance(child, WikiNode)
            and child.kind == NodeKind.LIST
            and child.sarg.startswith("#")
            and child.sarg.endswith("#")
        )
        if is_gloss_list:
            entry = page_data[-1]
            for gloss_item in child.find_child(NodeKind.LIST_ITEM):
                extract_gloss_list_item(wxr, entry, gloss_item)
            extract_tag_form_line_nodes(
                wxr, entry, level_node.children[pending_start:position]
            )
            pending_start = position + 1
            continue

        if not (
            isinstance(child, TemplateNode)
            and child.template_name in POS_SUBSECTION_TEMPLATES
        ):
            continue

        # A subsection marker after some senses starts a separate entry.
        if len(page_data[-1].senses) > 0:
            add_new_pos_data(wxr, page_data, base_data, level_node, pos_title)
        entry = page_data[-1]
        entry.raw_tags.append(clean_node(wxr, entry, child).strip("= \n"))
        translate_raw_tags(entry)

def extract_gloss_list_item(
    wxr: WiktextractContext,
    word_entry: WordEntry,
    list_item: WikiNode,
    parent_sense: Sense | None = None,
) -> None:
    """Build a ``Sense`` from one gloss list item and add it to ``word_entry``.

    Children of ``list_item`` are handled as follows:

    * A template whose cleaned text is wrapped in parentheses becomes a raw
      tag; any other template contributes its cleaned text to the gloss.
    * A nested list whose marker ends with ``:`` fills the translation of the
      last example, if that example exists and has no translation yet.
    * Otherwise a nested list whose marker ends with ``*`` or ``:`` is passed
      item by item to ``extract_example_list_item``.
    * Any other nested list is left out of the gloss text.
    * Every remaining node contributes to the gloss text.

    The sense is appended to ``word_entry.senses`` only if the gloss text is
    non-empty.  Nested lists whose marker starts and ends with ``#`` are then
    processed recursively with this sense as ``parent_sense``, whether or not
    the sense itself was appended.

    Args:
        wxr: Extraction context handed through to the helpers.
        word_entry: Entry that receives the sense(s).
        list_item: The gloss list item to process.
        parent_sense: Sense of the enclosing list item, if any.  The new
            sense starts as a deep copy of it.
    """
    gloss_nodes = []
    sense = (
        Sense() if parent_sense is None else parent_sense.model_copy(deep=True)
    )
    for node in list_item.children:
        if isinstance(node, TemplateNode):
            t_str = clean_node(wxr, sense, node)
            if t_str.startswith("(") and t_str.endswith(")"):
                # Remove exactly one enclosing pair.  ``strip("()")`` strips
                # a character set, so "(a (b))" would lose its inner closing
                # parenthesis and become the unbalanced "a (b".
                sense.raw_tags.append(t_str[1:-1])
            else:
                gloss_nodes.append(t_str)
        elif isinstance(node, WikiNode) and node.kind == NodeKind.LIST:
            if (
                node.sarg.endswith(":")
                and len(sense.examples) > 0
                and sense.examples[-1].translation == ""
            ):
                # With several items, each one overwrites the translation,
                # so the last item wins.
                for tr_list_item in node.find_child(NodeKind.LIST_ITEM):
                    sense.examples[-1].translation = clean_node(
                        wxr, sense, tr_list_item.children
                    )
                    calculate_bold_offsets(
                        wxr,
                        tr_list_item,
                        sense.examples[-1].translation,
                        sense.examples[-1],
                        "bold_translation_offsets",
                    )
            elif node.sarg.endswith(("*", ":")):
                for example_list_item in node.find_child(NodeKind.LIST_ITEM):
                    extract_example_list_item(
                        wxr, sense, example_list_item, word_entry.lang_code
                    )
        else:
            gloss_nodes.append(node)

    gloss_str = clean_node(wxr, sense, gloss_nodes)
    if gloss_str != "":
        sense.glosses.append(gloss_str)
        translate_raw_tags(sense)
        if "form-of" in word_entry.tags:
            extract_form_of_word(wxr, sense, list_item)
        word_entry.senses.append(sense)

    # Sub-glosses: each child item starts from a copy of this sense.
    for list_node in list_item.find_child(NodeKind.LIST):
        if list_node.sarg.startswith("#") and list_node.sarg.endswith("#"):
            for child_list_item in list_node.find_child(NodeKind.LIST_ITEM):
                extract_gloss_list_item(wxr, word_entry, child_list_item, sense)

144

145

def extract_form_of_word(
    wxr: WiktextractContext,
    sense: Sense,
    list_item: WikiNode,
) -> None:
    """Record the word this sense is a form of, taken from the item's links.

    Every direct LINK child of ``list_item`` is cleaned in order.  The text
    of the last link is appended to ``sense.form_of`` as an ``AltForm``,
    unless it is empty or there are no links.

    Args:
        wxr: Extraction context handed through to ``clean_node``.
        sense: Sense that receives the ``AltForm``.
        list_item: Gloss list item whose links are inspected.
    """
    link_texts = [
        clean_node(wxr, None, link)
        for link in list_item.find_child(NodeKind.LINK)
    ]
    if not link_texts:
        return
    target = link_texts[-1]
    if target != "":
        sense.form_of.append(AltForm(word=target))

156

157

def extract_note_section(
    wxr: WiktextractContext, page_data: list[WordEntry], level_node: LevelNode
) -> None:
    """Collect the notes of a section and attach them to matching entries.

    If ``level_node`` has direct LIST children, each non-empty list item
    becomes one note.  Otherwise the cleaned text of the whole section is a
    single note, when non-empty.  The notes are added to every entry in
    ``page_data`` that has the same ``lang_code`` as the last entry.

    Args:
        wxr: Extraction context handed through to ``clean_node``.
        page_data: Entries collected so far; matching ones get the notes.
        level_node: The note section node.
    """
    collected = []
    list_seen = False
    for list_node in level_node.find_child(NodeKind.LIST):
        list_seen = True
        item_texts = (
            clean_node(wxr, None, item.children)
            for item in list_node.find_child(NodeKind.LIST_ITEM)
        )
        collected.extend(text for text in item_texts if text != "")

    if not list_seen:
        # No list at all: treat the whole section body as one note.
        section_text = clean_node(wxr, None, level_node.children)
        if section_text != "":
            collected.append(section_text)

    if not page_data:
        return
    current_lang = page_data[-1].lang_code
    for entry in page_data:
        if entry.lang_code == current_lang:
            entry.notes.extend(collected)