Coverage for src/wiktextract/extractor/pl/page.py: 72%

107 statements  

« prev     ^ index     » next       coverage.py v7.6.4, created at 2024-10-25 10:11 +0000

1import itertools 

2import re 

3from typing import Any 

4 

5from wikitextprocessor.parser import LevelNode, NodeKind, TemplateNode, WikiNode 

6 

7from ...page import clean_node 

8from ...wxr_context import WiktextractContext 

9from .etymology import extract_etymology_section 

10from .example import extract_example_section 

11from .inflection import extract_inflection_section 

12from .linkage import LINKAGE_TYPES, extract_linkage_section 

13from .models import Form, Sense, WordEntry 

14from .note import extract_note_section 

15from .pos import extract_pos_section 

16from .sound import extract_sound_section 

17from .translation import extract_translation_section 

18 

19 

def parse_section(
    wxr: WiktextractContext,
    page_data: list[WordEntry],
    base_data: WordEntry,
    level_node: WikiNode,
) -> None:
    """Dispatch a section of a page to the matching extractor.

    The section title is produced by title templates:
    https://pl.wiktionary.org/wiki/Kategoria:Szablony_szablonów_haseł
    Sections whose title is not recognized are silently ignored.
    """
    title_text = clean_node(wxr, None, level_node.largs)
    wxr.wtp.start_subsection(title_text)
    if title_text == "wymowa" and wxr.config.capture_pronunciation:
        extract_sound_section(wxr, base_data, level_node)
    elif title_text == "znaczenia":
        extract_pos_section(wxr, page_data, base_data, level_node)
    elif title_text == "przykłady":
        extract_example_section(wxr, page_data, base_data, level_node)
    elif title_text == "etymologia" and wxr.config.capture_etymologies:
        extract_etymology_section(wxr, page_data, base_data, level_node)
    elif title_text == "tłumaczenia" and wxr.config.capture_translations:
        extract_translation_section(
            wxr, page_data, level_node, base_data.lang_code
        )
    # fix: this branch was gated on `capture_inflections`, so turning off
    # inflection capture also silently dropped every linkage section;
    # linkage extraction has its own `capture_linkages` switch
    elif title_text in LINKAGE_TYPES and wxr.config.capture_linkages:
        extract_linkage_section(
            wxr,
            page_data,
            level_node,
            LINKAGE_TYPES[title_text],
            base_data.lang_code,
        )
    elif title_text == "uwagi":
        extract_note_section(wxr, page_data, base_data, level_node)
    elif title_text == "odmiana" and wxr.config.capture_inflections:
        extract_inflection_section(
            wxr, page_data, base_data.lang_code, level_node
        )
    elif title_text == "zapis":
        extract_zapis_section(wxr, base_data, level_node)
    elif title_text == "transliteracja":
        extract_transliteracja_section(wxr, base_data, level_node)

60 

61 

def extract_zapis_section(
    wxr: WiktextractContext, base_data: WordEntry, level_node: LevelNode
) -> None:
    """Extract Traditional Chinese spellings from "ptrad" templates.

    Templates are searched recursively to get around the
    "preformatted" node they may be wrapped in.
    """
    for t_node in level_node.find_child_recursively(NodeKind.TEMPLATE):
        if t_node.template_name != "ptrad":
            continue
        first_arg = t_node.template_parameters.get(1, "")
        form_text = clean_node(wxr, None, first_arg)
        if form_text:
            new_form = Form(form=form_text, tags=["Traditional Chinese"])
            base_data.forms.append(new_form)

75 

76 

def extract_transliteracja_section(
    wxr: WiktextractContext, base_data: WordEntry, level_node: LevelNode
) -> None:
    """Extract romanization forms from a "transliteracja" section.

    Each list item is expected to contain plain text of the shape
    "(<sense indices>) <romanization>"; the parenthesized indices become
    the form's ``sense_index``.
    """
    for list_item in level_node.find_child_recursively(NodeKind.LIST_ITEM):
        for child in list_item.children:
            if not isinstance(child, str):
                continue
            index_match = re.search(r"\([\d\s,-.]+\)", child)
            if index_match is None:
                continue
            index_text = index_match.group(0).strip("()")
            romanized = child[index_match.end() :].strip()
            if len(romanized) > 0:
                new_form = Form(
                    form=romanized,
                    sense_index=index_text,
                    tags=["romanization"],
                )
                base_data.forms.append(new_form)

95 

96 

def parse_page(
    wxr: WiktextractContext, page_title: str, page_text: str
) -> list[dict[str, Any]]:
    """Parse one pl.wiktionary page into a list of word-entry dicts.

    Page layout:
    https://pl.wiktionary.org/wiki/Wikisłownik:Zasady_tworzenia_haseł
    """
    wxr.wtp.start_page(page_title)
    tree = wxr.wtp.parse(page_text, pre_expand=True)
    page_data: list[WordEntry] = []
    for level2_node in tree.find_child(NodeKind.LEVEL2):
        # The language template follows an opening "(" in the section
        # title, e.g. "słowo ({{język polski}})".
        seen_open_paren = False
        lang_code = "unknown"
        lang_name = "unknown"
        lang_title_cats: dict[str, Any] = {}
        title_nodes = itertools.chain.from_iterable(level2_node.largs)
        for title_node in title_nodes:
            if isinstance(title_node, str) and title_node.strip().endswith(
                "("
            ):
                seen_open_paren = True
            elif seen_open_paren and isinstance(title_node, TemplateNode):
                expanded = wxr.wtp.parse(
                    wxr.wtp.node_to_wikitext(title_node),
                    expand_all=True,
                )
                # The expanded language template carries the language
                # code in the "id" attribute of its first span.
                for span_tag in expanded.find_html("span"):
                    lang_code = span_tag.attrs.get("id", "")
                    break
                lang_name = clean_node(wxr, lang_title_cats, expanded)
                break
        capture_codes = wxr.config.capture_language_codes
        if capture_codes is not None and lang_code not in capture_codes:
            continue
        wxr.wtp.start_section(lang_name)
        base_data = WordEntry(
            word=wxr.wtp.title,
            lang_code=lang_code,
            lang=lang_name,
            pos="unknown",
            categories=lang_title_cats.get("categories", []),
        )
        for level3_node in level2_node.find_child(NodeKind.LEVEL3):
            parse_section(wxr, page_data, base_data, level3_node)

    for entry in page_data:
        if not entry.senses:
            entry.senses.append(Sense(tags=["no-gloss"]))
    return [entry.model_dump(exclude_defaults=True) for entry in page_data]

150 

151 

def match_sense_index(sense_index: str, word_entry: WordEntry) -> bool:
    """Return True if *word_entry*'s first sense matches *sense_index*.

    *word_entry* may be a ``WordEntry`` (first sense is used) or, duck-typed,
    a ``Sense`` itself.  *sense_index* is either an exact index like "1.1"
    or a list/range expression like "1.1, 1.2" or "1-3"; for the latter we
    only compare the POS section number (the part before the first ".").
    """
    if hasattr(word_entry, "senses"):
        if len(word_entry.senses) == 0:
            return False
        sense = word_entry.senses[0]
    elif isinstance(word_entry, Sense):
        sense = word_entry
    else:
        # fix: previously fell through with `sense` unbound -> NameError
        return False
    # an index like "1.1" (no list, no range) must match exactly
    exact_match = not (
        "," in sense_index or "-" in sense_index or "." not in sense_index
    )
    if exact_match:
        return sense_index == sense.sense_index

    # POS section number of the sense's own index, i.e. the part before
    # its first ".".  fix: this was sliced with `sense_index.find(".")`
    # (the argument's dot position), which e.g. for a range like "1-3"
    # chopped the last character instead and broke all range matches.
    pos_index_str = sense.sense_index.split(".")[0]
    if not pos_index_str.isdigit():
        return False
    pos_section_index = int(pos_index_str)

    for part_of_index in sense_index.split(","):
        part_of_index = part_of_index.strip()
        if (
            "." in part_of_index
            and pos_index_str == part_of_index[: part_of_index.find(".")]
        ):
            return True
        elif re.fullmatch(r"\d+-\d+", part_of_index):
            start_str, end_str = part_of_index.split("-")
            if int(start_str) <= pos_section_index <= int(end_str):
                return True

    return False