Coverage for src/wiktextract/extractor/el/parse_utils.py: 38%

64 statements  

« prev     ^ index     » next       coverage.py v7.9.2, created at 2025-07-04 10:58 +0000

1import re 

2import unicodedata 

3from typing import Generator, TypeAlias 

4 

5from wikitextprocessor import WikiNode 

6 

7from wiktextract.page import clean_node 

8from wiktextract.wxr_context import WiktextractContext 

9 

10from .models import Form, WordEntry 

11from .section_titles import ( 

12 POS_HEADINGS, 

13 POS_HEADINGS_RE, 

14 SUBSECTION_HEADINGS, 

15 SUBSECTIONS_RE, 

16 Heading, 

17 POSName, 

18 Tags, 

19) 

20from .text_utils import normalized_int 

21 

# Ignorable templates that generate panels to the side, like
# Template:Wikipedia, or other meta-info like Template:see.
# Called 'panel templates' because they often generate panels.
PANEL_TEMPLATES: set[str] = {
    "improve",
    "interwiktionary",
    "stub",
    "wik",
    "wikipedia",
    "Wikipedia",
    "wikiquote",
    "Wikiquote",
    "wikispecies",
}

# Template name prefixes used for language-specific panel templates (i.e.,
# templates that create side boxes or notice boxes or that should generally
# be ignored).
# PANEL_PREFIXES: set[str] = set()

# Additional templates to be expanded in the pre-expand phase
# XXX nothing here yet, add as needed if some template turns out to be
# problematic when unexpanded.
ADDITIONAL_EXPAND_TEMPLATES: set[str] = set()

# Names of templates used in etymology sections whose parameters we want
# to store in `etymology_templates`.
ETYMOLOGY_TEMPLATES: set[str] = set()

# Language codes for Greek and Greek varieties/ancestors.
GREEK_LANGCODES: set[str] = {
    "el",
    "el2",
    "el-crt",
    "el-cyp",
    "gkm",
    "gkm-crt",
    "gkm-cyp",
    "gmy",
    "gmy2",
    "grc",
    "grc-dor",
    "grc-ion",
    "grc-koi",
    "grk",
    "grk-ita",
    "grk-pro",
    "kath",
    "pnt",
    "pregrc",
    "tsd",
    "xme-old",
    "xmk",
}

79 

80 

# A cleaned section-heading title string.
Title: TypeAlias = str

# Parsed POS-section rows: each tuple bundles the POS name, the heading
# title, heading tags, a section number, the section's WikiNode, and the
# WordEntry the section belongs to.
POSReturns: TypeAlias = list[
    tuple[POSName, Title, Tags, int, WikiNode, WordEntry]
]

86 

87 

def find_sections(
    wxr: WiktextractContext,
    nodes: list[WikiNode],
) -> Generator[tuple[Heading, POSName, Title, Tags, int, WikiNode], None, None]:
    """Classify each heading node and yield its parsed data.

    For every node, the heading title is cleaned and lower-cased, then
    classified with parse_lower_heading(). Yields tuples of
    (heading type, POS/section name, heading title, tags, number, node).
    Unrecognized headings are still yielded (as Heading.Err) so the
    caller can decide how to handle them.
    """
    for node in nodes:
        heading_title = clean_node(wxr, None, node.largs[0]).lower().strip()

        # `heading_type` avoids shadowing the `type` builtin; the success
        # flag is deliberately ignored here because error headings are
        # yielded anyway.
        heading_type, pos, heading_name, tags, num, _ok = parse_lower_heading(
            wxr, heading_title
        )

        if num > 0:
            # Numbered sub-sub-sections are unexpected; log for review.
            wxr.wtp.warning(
                f"Sub-sub-section is numbered: {heading_name}, {num=}",
                sortid="page/find_pos_sections_1",
            )
        yield heading_type, pos, heading_name, tags, num, node

105 

106 

def parse_lower_heading(
    wxr: WiktextractContext, heading: str
) -> tuple[Heading, str, str, Tags, int, bool]:
    """Determine if a heading is for a part of speech or other subsection.

    Returns a 6-tuple: heading type enum, POS name or section name, the
    original heading string, a list of tags, a trailing section number
    (-1 when absent) and a success bool.  (The previous docstring listed
    only four items; the return value has always been a 6-tuple.)
    """
    # Try POS headings first; fall through to generic subsections.
    if m := POS_HEADINGS_RE.match(heading):
        pos, tags, num, ok = parse_pos_heading(wxr, heading, m)
        if ok:
            return Heading.POS, pos, heading, tags, num, True

    if m := SUBSECTIONS_RE.match(heading):
        section, section_name, tags, num, ok = parse_section_heading(
            wxr, heading, m
        )
        if ok:
            return section, section_name, heading, tags, num, True

    # Neither regex recognized the heading.
    return Heading.Err, "", heading, [], -1, False

126 

127 

def parse_pos_heading(
    wxr: WiktextractContext, heading: str, m: re.Match
) -> tuple[POSName, Tags, int, bool]:
    """Extract the POS name, its tags and an optional trailing section
    number from a heading matched by POS_HEADINGS_RE."""
    pos_key = m.group(1)
    trailing = m.group(2)
    # Headings may carry a numeric suffix ("noun 2"); -1 means "none".
    suffix = trailing.strip() if trailing else ""
    section_number = normalized_int(suffix) if suffix.isdigit() else -1
    pos_data = POS_HEADINGS[pos_key]
    return pos_data["pos"], pos_data.get("tags", []), section_number, True

141 

142 

def parse_section_heading(
    wxr: WiktextractContext, heading: str, m: re.Match
) -> tuple[Heading, str, Tags, int, bool]:
    """Extract the section type, section name, tags and an optional
    trailing section number from a heading matched by SUBSECTIONS_RE."""
    section_key = m.group(1)
    trailing = m.group(2)
    # Headings may carry a numeric suffix; -1 means "none".
    suffix = trailing.strip() if trailing else ""
    section_number = normalized_int(suffix) if suffix.isdigit() else -1
    section_data = SUBSECTION_HEADINGS[section_key]
    return (
        section_data["type"],
        section_key,
        section_data.get("tags", []),
        section_number,
        True,
    )

162 

163 

# https://stackoverflow.com/a/518232
def strip_accents(accented: str) -> str:
    """Return `accented` with combining marks (Unicode category Mn)
    removed, e.g. turning an accented Greek vowel into its base letter."""
    decomposed = unicodedata.normalize("NFD", accented)
    kept = [ch for ch in decomposed if unicodedata.category(ch) != "Mn"]
    return "".join(kept)

171 

172 

def remove_duplicate_forms(
    wxr: WiktextractContext, forms: list[Form]
) -> list[Form]:
    """Check for identical `forms` and remove duplicates.

    Two forms are duplicates when `form`, `tags` and `raw_tags` are all
    equal; of each duplicate group only the last occurrence is kept.
    Returns the original list object when nothing was removed.
    """
    if not forms:
        return []
    deduped: list[Form] = []
    for idx, candidate in enumerate(forms):
        # Keep a form only when no identical form appears later in the
        # list, so duplicates collapse onto their final occurrence.
        has_later_twin = any(
            candidate.form == other.form
            and candidate.tags == other.tags
            and candidate.raw_tags == other.raw_tags
            for other in forms[idx + 1 :]
        )
        if not has_later_twin:
            deduped.append(candidate)
    if len(deduped) < len(forms):
        # wxr.wtp.debug("Found duplicate forms", sortid="simple/pos/32")
        return deduped
    return forms