Coverage for src/wiktextract/extractor/el/parse_utils.py: 38%

64 statements  

« prev     ^ index     » next       coverage.py v7.9.2, created at 2025-07-04 10:58 +0000

1import re 

2import unicodedata 

3from typing import Generator, TypeAlias 

4 

5from wikitextprocessor import WikiNode 

6 

7from wiktextract.page import clean_node 

8from wiktextract.wxr_context import WiktextractContext 

9 

10from .models import Form, WordEntry 

11from .section_titles import ( 

12 POS_HEADINGS, 

13 POS_HEADINGS_RE, 

14 SUBSECTION_HEADINGS, 

15 SUBSECTIONS_RE, 

16 Heading, 

17 POSName, 

18 Tags, 

19) 

20from .text_utils import normalized_int 

21 

# Ignorable templates that generate panels to the side, like
# Template:Wikipedia, or other meta-info like Template:see.
# Called 'panel templates' because they often generate panels.
PANEL_TEMPLATES: set[str] = {
    "improve",
    "interwiktionary",
    "stub",
    "wik",
    "wikipedia",
    "Wikipedia",
    "wikiquote",
    "Wikiquote",
    "wikispecies",
}

# Template name prefixes used for language-specific panel templates (i.e.,
# templates that create side boxes or notice boxes or that should generally
# be ignored).
# PANEL_PREFIXES: set[str] = set()

# Additional templates to be expanded in the pre-expand phase
# XXX nothing here yet, add as needed if some template turns out to be
# problematic when unexpanded.
ADDITIONAL_EXPAND_TEMPLATES: set[str] = set()

# Names of templates used in etymology sections whose parameters we want
# to store in `etymology_templates`.
ETYMOLOGY_TEMPLATES: set[str] = set()

# Language codes for Greek and Greek varieties/ancestors.
GREEK_LANGCODES: set[str] = {
    "el",
    "el2",
    "el-crt",
    "el-cyp",
    "gkm",
    "gkm-crt",
    "gkm-cyp",
    "gmy",
    "gmy2",
    "grc",
    "grc-dor",
    "grc-ion",
    "grc-koi",
    "grk",
    "grk-ita",
    "grk-pro",
    "kath",
    "pnt",
    "pregrc",
    "tsd",
    "xme-old",
    "xmk",
}

79 

80 

# A cleaned section-heading title string.
Title: TypeAlias = str

# Parsed POS-section rows: each tuple bundles the POS name, the heading
# title, heading tags, a section number, the section's WikiNode, and the
# WordEntry the section belongs to.
POSReturns: TypeAlias = list[
    tuple[POSName, Title, Tags, int, WikiNode, WordEntry]
]

86 

87 

def find_sections(
    wxr: WiktextractContext,
    nodes: list[WikiNode],
) -> Generator[tuple[Heading, POSName, Title, Tags, int, WikiNode], None, None]:
    """Classify each heading node and yield its parsed data.

    For every node, the heading title is cleaned and lower-cased, then
    classified with parse_lower_heading(). Yields tuples of
    (heading type, POS/section name, heading title, tags, number, node).
    Unrecognized headings are still yielded (as Heading.Err) so the
    caller can decide how to handle them.
    """
    for node in nodes:
        heading_title = clean_node(wxr, None, node.largs[0]).lower().strip()

        # `heading_type` avoids shadowing the `type` builtin; the success
        # flag is deliberately ignored here because error headings are
        # yielded anyway.
        heading_type, pos, heading_name, tags, num, _ok = parse_lower_heading(
            wxr, heading_title
        )

        if num > 0:
            # Numbered sub-sub-sections are unexpected; log for review.
            wxr.wtp.warning(
                f"Sub-sub-section is numbered: {heading_name}, {num=}",
                sortid="page/find_pos_sections_1",
            )
        yield heading_type, pos, heading_name, tags, num, node

105 

106 

def parse_lower_heading(
    wxr: WiktextractContext, heading: str
) -> tuple[Heading, str, str, Tags, int, bool]:
    """Determine if a heading is for a part of speech or other subsection.

    Returns a 6-tuple: heading type enum, POS name or section name, the
    original heading string, a list of tags, a trailing section number
    (-1 when absent) and a success bool.  (The previous docstring listed
    only four items; the return value has always been a 6-tuple.)
    """
    # Try POS headings first; fall through to generic subsections.
    if m := POS_HEADINGS_RE.match(heading):
        pos, tags, num, ok = parse_pos_heading(wxr, heading, m)
        if ok:
            return Heading.POS, pos, heading, tags, num, True

    if m := SUBSECTIONS_RE.match(heading):
        section, section_name, tags, num, ok = parse_section_heading(
            wxr, heading, m
        )
        if ok:
            return section, section_name, heading, tags, num, True

    # Neither regex recognized the heading.
    return Heading.Err, "", heading, [], -1, False

126 

127 

def parse_pos_heading(
    wxr: WiktextractContext, heading: str, m: re.Match
) -> tuple[POSName, Tags, int, bool]:
    """Extract the POS name, its tags and an optional trailing section
    number from a heading matched by POS_HEADINGS_RE."""
    pos_key = m.group(1)
    trailing = m.group(2)
    # Headings may carry a numeric suffix ("noun 2"); -1 means "none".
    suffix = trailing.strip() if trailing else ""
    section_number = normalized_int(suffix) if suffix.isdigit() else -1
    pos_data = POS_HEADINGS[pos_key]
    return pos_data["pos"], pos_data.get("tags", []), section_number, True

141 

142 

def parse_section_heading(
    wxr: WiktextractContext, heading: str, m: re.Match
) -> tuple[Heading, str, Tags, int, bool]:
    """Extract the section type, section name, tags and an optional
    trailing section number from a heading matched by SUBSECTIONS_RE."""
    section_key = m.group(1)
    trailing = m.group(2)
    # Headings may carry a numeric suffix; -1 means "none".
    suffix = trailing.strip() if trailing else ""
    section_number = normalized_int(suffix) if suffix.isdigit() else -1
    section_data = SUBSECTION_HEADINGS[section_key]
    return (
        section_data["type"],
        section_key,
        section_data.get("tags", []),
        section_number,
        True,
    )

162 

163 

# https://stackoverflow.com/a/518232
def strip_accents(accented: str) -> str:
    """Return `accented` with combining marks (Unicode category Mn)
    removed, e.g. turning an accented Greek vowel into its base letter."""
    decomposed = unicodedata.normalize("NFD", accented)
    kept = [ch for ch in decomposed if unicodedata.category(ch) != "Mn"]
    return "".join(kept)

171 

172 

def remove_duplicate_forms(
    wxr: WiktextractContext, forms: list[Form]
) -> list[Form]:
    """Check for identical `forms` and remove duplicates.

    Two forms are duplicates when `form`, `tags` and `raw_tags` are all
    equal; of each duplicate group only the last occurrence is kept.
    Returns the original list object when nothing was removed.
    """
    if not forms:
        return []
    deduped: list[Form] = []
    for idx, candidate in enumerate(forms):
        # Keep a form only when no identical form appears later in the
        # list, so duplicates collapse onto their final occurrence.
        has_later_twin = any(
            candidate.form == other.form
            and candidate.tags == other.tags
            and candidate.raw_tags == other.raw_tags
            for other in forms[idx + 1 :]
        )
        if not has_later_twin:
            deduped.append(candidate)
    if len(deduped) < len(forms):
        # wxr.wtp.debug("Found duplicate forms", sortid="simple/pos/32")
        return deduped
    return forms