Coverage for src / wiktextract / extractor / pl / page.py: 81%

99 statements  

« prev     ^ index     » next       coverage.py v7.13.0, created at 2025-12-12 08:09 +0000

1import itertools 

2import re 

3from typing import Any 

4 

5from wikitextprocessor import LevelNode, NodeKind, TemplateNode 

6 

7from ...page import clean_node 

8from ...wxr_context import WiktextractContext 

9from .etymology import extract_etymology_section 

10from .example import extract_example_section 

11from .form import FORM_SECTIONS, extract_form_section 

12from .inflection import extract_inflection_section 

13from .linkage import LINKAGE_TYPES, extract_linkage_section 

14from .models import Sense, WordEntry 

15from .note import extract_note_section 

16from .pos import extract_pos_section 

17from .sound import extract_morphology_section, extract_sound_section 

18from .translation import extract_translation_section 

19 

20 

def parse_section(
    wxr: WiktextractContext,
    page_data: list[WordEntry],
    base_data: WordEntry,
    level_node: LevelNode,
) -> None:
    """Dispatch one entry section to the extractor matching its title.

    The section title is cleaned to plain text, registered as the current
    subsection on ``wxr.wtp`` and then compared against the known Polish
    section titles. Several extractors only run when the corresponding
    ``wxr.config.capture_*`` flag is set.

    Args:
        wxr: Extraction context providing the parser (``wxr.wtp``) and the
            capture configuration (``wxr.config``).
        page_data: Word entries collected so far for the page; passed on to
            the extractors.
        base_data: Entry-level data of the current language section; passed
            on to the extractors (some only receive its ``lang_code``).
        level_node: The section node whose title selects the extractor.
    """
    # title templates
    # https://pl.wiktionary.org/wiki/Kategoria:Szablony_szablonów_haseł
    title_text = clean_node(wxr, None, level_node.largs)
    wxr.wtp.start_subsection(title_text)
    config = wxr.config

    # NOTE: a known section whose capture flag is disabled falls through the
    # whole chain and is reported by the final "Unknown section" branch.
    if title_text == "wymowa" and config.capture_pronunciation:
        extract_sound_section(wxr, base_data, level_node)
    elif title_text == "znaczenia":
        extract_pos_section(wxr, page_data, base_data, level_node)
    elif title_text == "przykłady":
        extract_example_section(wxr, page_data, base_data, level_node)
    elif title_text == "etymologia" and config.capture_etymologies:
        extract_etymology_section(wxr, page_data, base_data, level_node)
    elif title_text == "tłumaczenia" and config.capture_translations:
        extract_translation_section(
            wxr, page_data, level_node, base_data.lang_code
        )
    # Linkage sections are controlled by the linkage flag; the inflection
    # flag only applies to the "odmiana" section handled further below.
    elif title_text in LINKAGE_TYPES and config.capture_linkages:
        extract_linkage_section(
            wxr,
            page_data,
            level_node,
            LINKAGE_TYPES[title_text],
            base_data.lang_code,
        )
    elif title_text in ["uwagi", "składnia"]:
        extract_note_section(wxr, page_data, base_data, level_node)
    elif title_text == "odmiana" and config.capture_inflections:
        extract_inflection_section(
            wxr, page_data, base_data.lang_code, level_node
        )
    elif title_text in FORM_SECTIONS:
        extract_form_section(
            wxr, page_data, base_data, level_node, FORM_SECTIONS[title_text]
        )
    elif title_text == "morfologia":
        extract_morphology_section(wxr, base_data, level_node)
    elif title_text not in ["źródła", "klucz"]:
        # "źródła" and "klucz" are skipped silently; anything else is logged.
        wxr.wtp.debug(
            f"Unknown section: {title_text}",
            sortid="extractor/pl/page/parse_section/63",
        )

68 

69 

def parse_page(
    wxr: WiktextractContext, page_title: str, page_text: str
) -> list[dict[str, Any]]:
    """Parse one page and return its word entries as plain dictionaries.

    Each level-2 node is treated as a language section. The language name
    and code are taken from the first template that follows an opening
    parenthesis in the section title; both stay ``"unknown"`` when no such
    template is found. Every level-3 child is passed to ``parse_section``.

    After parsing, entries without senses receive a ``no-gloss`` sense, and
    sounds are kept only if their ``sense_index`` is empty or matches the
    entry according to ``match_sense_index``.

    Args:
        wxr: Extraction context providing the parser and configuration.
        page_title: Title of the page, used to start the page on the parser.
        page_text: Wikitext of the page.

    Returns:
        One ``model_dump(exclude_defaults=True)`` dictionary per word entry.
    """
    # page layout
    # https://pl.wiktionary.org/wiki/Wikisłownik:Zasady_tworzenia_haseł
    wxr.wtp.start_page(page_title)
    root = wxr.wtp.parse(page_text, pre_expand=True)
    page_data: list[WordEntry] = []

    for lang_node in root.find_child(NodeKind.LEVEL2):
        lang_code = "unknown"
        lang_name = "unknown"
        lang_title_cats = {}
        seen_open_paren = False
        for title_part in itertools.chain.from_iterable(lang_node.largs):
            if isinstance(title_part, str) and title_part.strip().endswith(
                "("
            ):
                seen_open_paren = True
                continue
            if not (
                seen_open_paren and isinstance(title_part, TemplateNode)
            ):
                continue
            # Expand the language template found after the "(".
            expanded = wxr.wtp.parse(
                wxr.wtp.node_to_wikitext(title_part),
                expand_all=True,
            )
            # The first <span> in the expansion carries the code as its id.
            first_span = next(iter(expanded.find_html("span")), None)
            if first_span is not None:
                lang_code = first_span.attrs.get("id", "")
            lang_name = clean_node(wxr, lang_title_cats, expanded)
            break

        wanted_codes = wxr.config.capture_language_codes
        if wanted_codes is not None and lang_code not in wanted_codes:
            continue

        wxr.wtp.start_section(lang_name)
        base_data = WordEntry(
            word=wxr.wtp.title,
            lang_code=lang_code,
            lang=lang_name,
            pos="unknown",
            categories=lang_title_cats.get("categories", []),
        )
        for section_node in lang_node.find_child(NodeKind.LEVEL3):
            parse_section(wxr, page_data, base_data, section_node)

    for entry in page_data:
        if not entry.senses:
            entry.senses.append(Sense(tags=["no-gloss"]))
        # Drop sounds that refer to a sense of a different entry.
        entry.sounds = [
            sound
            for sound in entry.sounds
            if sound.sense_index == ""
            or match_sense_index(sound.sense_index, entry)
        ]

    return [entry.model_dump(exclude_defaults=True) for entry in page_data]

130 

131 

def match_sense_index(sense_index: str, word_entry: WordEntry) -> bool:
    """Tell whether a sense-index reference applies to an entry or sense.

    A plain index such as ``"1.1"`` (contains ``"."`` but neither ``","``
    nor ``"-"``) must equal the sense's own index. Any other reference is
    matched on the POS section number, i.e. the digits before the first
    ``"."`` of the sense's index: a comma-separated item matches when it has
    the same number before its ``"."``, or when it is a ``"N-M"`` range that
    contains the section number.

    Args:
        sense_index: Reference text, e.g. ``"1.1"``, ``"1.1, 1.2"``, ``"1-2"``.
        word_entry: An object with a ``senses`` list (its first sense is
            used) or a single ``Sense``.

    Returns:
        ``True`` on a match; ``False`` otherwise, including when the entry
        has no senses or the object is of neither supported kind.
    """
    if hasattr(word_entry, "senses"):
        if len(word_entry.senses) == 0:
            return False
        sense = word_entry.senses[0]
    elif isinstance(word_entry, Sense):
        sense = word_entry
    else:
        # Nothing to compare against.
        return False

    # find exact match for index like "1.1"
    exact_match = (
        "." in sense_index
        and "," not in sense_index
        and "-" not in sense_index
    )
    if exact_match:
        return sense_index == sense.sense_index

    # The section number must come from the sense's own index. Slicing it at
    # the "." position of the requested string breaks range requests like
    # "1-2" (no "." at all) and truncates multi-digit numbers like "10.1".
    pos_index_str = sense.sense_index.partition(".")[0]
    if not pos_index_str.isdigit():
        return False
    pos_section_index = int(pos_index_str)

    for part_of_index in sense_index.split(","):
        part_of_index = part_of_index.strip()
        if (
            "." in part_of_index
            and pos_index_str == part_of_index.partition(".")[0]
        ):
            return True
        if re.fullmatch(r"\d+-\d+", part_of_index):
            start_str, end_str = part_of_index.split("-")
            if int(start_str) <= pos_section_index <= int(end_str):
                return True

    return False