Coverage for src/wiktextract/extractor/es/page.py: 78%

1from wikitextprocessor.parser import (

2 LEVEL_KIND_FLAGS,

3 NodeKind,

4 TemplateNode,

5 WikiNode,

8from ...page import clean_node

9from ...wxr_context import WiktextractContext

10from ...wxr_logging import logger

11from .conjugation import extract_conjugation_section

12from .etymology import extract_etymology_section

13from .linkage import (

14 extract_additional_information_section,

15 extract_alt_form_section,

16 extract_linkage_section,

17)

18from .models import Sense, WordEntry

19from .pos import extract_pos_section

20from .pronunciation import process_pron_graf_template

21from .section_titles import (

22 IGNORED_TITLES,

23 LINKAGE_TITLES,

24 POS_TITLES,

25 TRANSLATIONS_TITLES,

26)

27from .translation import extract_translation_section

30def parse_section(

31 wxr: WiktextractContext,

32 page_data: list[WordEntry],

33 base_data: WordEntry,

34 level_node: WikiNode,

35) -> None:

36 """

37 Parses indidividual sibling sections of an entry,

38 e.g. https://es.wiktionary.org/wiki/amor:

40 === Etimología ===

41 === {{sustantivo masculino|es}} ===

42 === Locuciones ===

43 """

45 categories = {}

46 section_title = clean_node(wxr, categories, level_node.largs)

47 original_section_title = section_title

48 section_title = section_title.lower()

49 wxr.wtp.start_subsection(original_section_title)

50 if section_title == "": 50 ↛ 51line 50 didn't jump to line 51 because the condition on line 50 was never true

51 return None

53 pos_template_name = ""

54 for level_node_template in level_node.find_content(NodeKind.TEMPLATE):

55 pos_template_name = level_node_template.template_name

56 break

58 pos_keys = [

59 section_title,

60 pos_template_name,

61 " ".join(section_title.split()[:2]),

62 section_title.split()[0],

63 ]

64 if section_title in IGNORED_TITLES: 64 ↛ 65line 64 didn't jump to line 65 because the condition on line 64 was never true

65 pass

66 elif any(key in POS_TITLES for key in pos_keys):

67 pos_data = None

68 for key in pos_keys: 68 ↛ 72line 68 didn't jump to line 72 because the loop on line 68 didn't complete

69 pos_data = POS_TITLES.get(key)

70 if pos_data is not None:

71 break

72 if pos_data is not None: 72 ↛ 137line 72 didn't jump to line 137 because the condition on line 72 was always true

73 pos_type = pos_data["pos"]

74 page_data.append(base_data.model_copy(deep=True))

75 page_data[-1].pos = pos_type

76 page_data[-1].pos_title = original_section_title

77 page_data[-1].tags.extend(pos_data.get("tags", []))

78 page_data[-1].categories.extend(categories.get("categories", []))

79 extract_pos_section(wxr, page_data[-1], level_node, section_title)

80 if len(page_data[-1].senses) == 0:

81 if "form-of" in page_data[-1].tags:

82 page_data.pop()

83 elif section_title in LINKAGE_TITLES: 83 ↛ 84line 83 didn't jump to line 84 because the condition on line 83 was never true

84 page_data.pop()

85 extract_linkage_section(

86 wxr,

87 page_data,

88 level_node,

89 LINKAGE_TITLES[section_title],

90 )

91 elif (

92 section_title.startswith("etimología")

93 and wxr.config.capture_etymologies

94 ):

95 if level_node.contain_node(LEVEL_KIND_FLAGS):

96 base_data = base_data.model_copy(deep=True)

97 extract_etymology_section(wxr, base_data, level_node)

98 elif (

99 section_title in TRANSLATIONS_TITLES and wxr.config.capture_translations

100 ):

101 if len(page_data) == 0: 101 ↛ 102line 101 didn't jump to line 102 because the condition on line 101 was never true

102 page_data.append(base_data.model_copy(deep=True))

103 extract_translation_section(wxr, page_data, level_node)

104 elif section_title == "descendientes": 104 ↛ 105line 104 didn't jump to line 105 because the condition on line 104 was never true

105 if len(page_data) == 0:

106 page_data.append(base_data.model_copy(deep=True))

107 extract_translation_section(wxr, page_data, level_node, False)

108 elif ( 108 ↛ 112line 108 didn't jump to line 112 because the condition on line 108 was never true

109 section_title in LINKAGE_TITLES

110 or section_title.removesuffix("s") in LINKAGE_TITLES

111 ):

112 if section_title not in LINKAGE_TITLES:

113 section_title = section_title.removesuffix("s")

114 if len(page_data) == 0:

115 page_data.append(base_data.model_copy(deep=True))

116 extract_linkage_section(

117 wxr, page_data, level_node, LINKAGE_TITLES[section_title]

118 )

119 elif section_title == "conjugación":

120 if len(page_data) == 0: 120 ↛ 121line 120 didn't jump to line 121 because the condition on line 120 was never true

121 page_data.append(base_data.model_copy(deep=True))

122 extract_conjugation_section(wxr, page_data, level_node)

123 elif section_title == "formas alternativas":

124 extract_alt_form_section(

125 wxr, page_data[-1] if len(page_data) > 0 else base_data, level_node

126 )

127 elif section_title == "información adicional": 127 ↛ 132line 127 didn't jump to line 132 because the condition on line 127 was always true

128 extract_additional_information_section(

129 wxr, page_data[-1] if len(page_data) > 0 else base_data, level_node

130 )

131 else:

132 wxr.wtp.debug(

133 f"Unprocessed section: {section_title}",

134 sortid="extractor/es/page/parse_section/48",

135 )

136

137 for link_node in level_node.find_child(NodeKind.LINK):

138 clean_node(

139 wxr, page_data[-1] if len(page_data) > 0 else base_data, link_node

140 )

141

142 for next_level_node in level_node.find_child(LEVEL_KIND_FLAGS):

143 parse_section(wxr, page_data, base_data, next_level_node)

144

145

146def parse_page(

147 wxr: WiktextractContext, page_title: str, page_text: str

148) -> list[dict[str, any]]:

149 # style guide

150 # https://es.wiktionary.org/wiki/Wikcionario:Guía_de_estilo

151 # entry layout

152 # https://es.wiktionary.org/wiki/Wikcionario:Estructura

153 if wxr.config.verbose: 153 ↛ 154line 153 didn't jump to line 154 because the condition on line 153 was never true

154 logger.info(f"Parsing page: {page_title}")

155 wxr.wtp.start_page(page_title)

156 tree = wxr.wtp.parse(page_text)

157 page_data: list[WordEntry] = []

158 for level2_node in tree.find_child(NodeKind.LEVEL2):

159 categories = {}

160 lang_code = "unknown"

161 lang_name = "unknown"

162 section_title = clean_node(wxr, None, level2_node.largs)

163 if section_title.lower() == "referencias y notas": 163 ↛ 164line 163 didn't jump to line 164 because the condition on line 163 was never true

164 continue

165 for subtitle_template in level2_node.find_content(NodeKind.TEMPLATE): 165 ↛ 172line 165 didn't jump to line 172 because the loop on line 165 didn't complete

166 # https://es.wiktionary.org/wiki/Plantilla:lengua

167 # https://es.wiktionary.org/wiki/Apéndice:Códigos_de_idioma

168 if subtitle_template.template_name == "lengua": 168 ↛ 165line 168 didn't jump to line 165 because the condition on line 168 was always true

169 lang_code = subtitle_template.template_parameters.get(1).lower()

170 lang_name = clean_node(wxr, categories, subtitle_template)

171 break

172 if ( 172 ↛ 176line 172 didn't jump to line 176 because the condition on line 172 was never true

173 wxr.config.capture_language_codes is not None

174 and lang_code not in wxr.config.capture_language_codes

175 ):

176 continue

177 wxr.wtp.start_section(lang_name)

178 base_data = WordEntry(

179 lang=lang_name,

180 lang_code=lang_code,

181 word=page_title,

182 pos="unknown",

183 categories=categories.get("categories", []),

184 )

185 for node in level2_node.find_child(NodeKind.TEMPLATE | NodeKind.LINK):

186 if (

187 isinstance(node, TemplateNode)

188 and node.template_name == "pron-graf"

189 ):

190 process_pron_graf_template(wxr, base_data, node)

191 elif node.kind == NodeKind.LINK: 191 ↛ 185line 191 didn't jump to line 185 because the condition on line 191 was always true

192 clean_node(wxr, base_data, node)

193

194 for next_level_node in level2_node.find_child(LEVEL_KIND_FLAGS):

195 parse_section(wxr, page_data, base_data, next_level_node)

196

197 for data in page_data:

198 if len(data.senses) == 0:

199 data.senses.append(Sense(tags=["no-gloss"]))

200 return [d.model_dump(exclude_defaults=True) for d in page_data]