Coverage for src/wiktextract/extractor/pl/page.py: 79%

94 statements  

« prev     ^ index     » next       coverage.py v7.9.0, created at 2025-06-13 07:43 +0000

1import itertools 

2import re 

3from typing import Any 

4 

5from wikitextprocessor import LevelNode, NodeKind, TemplateNode 

6 

7from ...page import clean_node 

8from ...wxr_context import WiktextractContext 

9from .etymology import extract_etymology_section 

10from .example import extract_example_section 

11from .form import FORM_SECTIONS, extract_form_section 

12from .inflection import extract_inflection_section 

13from .linkage import LINKAGE_TYPES, extract_linkage_section 

14from .models import Sense, WordEntry 

15from .note import extract_note_section 

16from .pos import extract_pos_section 

17from .sound import extract_morphology_section, extract_sound_section 

18from .translation import extract_translation_section 

19 

20 

def parse_section(
    wxr: WiktextractContext,
    page_data: list[WordEntry],
    base_data: WordEntry,
    level_node: LevelNode,
) -> None:
    """Dispatch one section node to the extractor matching its title.

    The section title is the cleaned text of ``level_node.largs``.  It is
    registered as the current subsection and then compared against the
    known section names; the first matching branch calls the corresponding
    ``extract_*_section`` helper.  Several branches are additionally gated
    on a ``wxr.config.capture_*`` flag.

    Titles that match no branch are reported through ``wxr.wtp.debug``,
    except "źródła" and "klucz", which are skipped silently.  Note that a
    known title whose capture flag is off falls through to that same final
    check and is therefore reported as unknown.
    """
    # title templates
    # https://pl.wiktionary.org/wiki/Kategoria:Szablony_szablonów_haseł
    title_text = clean_node(wxr, None, level_node.largs)
    wxr.wtp.start_subsection(title_text)
    if title_text == "wymowa" and wxr.config.capture_pronunciation:
        extract_sound_section(wxr, base_data, level_node)
    elif title_text == "znaczenia":
        extract_pos_section(wxr, page_data, base_data, level_node)
    elif title_text == "przykłady":
        extract_example_section(wxr, page_data, base_data, level_node)
    elif title_text == "etymologia" and wxr.config.capture_etymologies:
        extract_etymology_section(wxr, page_data, base_data, level_node)
    elif title_text == "tłumaczenia" and wxr.config.capture_translations:
        extract_translation_section(
            wxr, page_data, level_node, base_data.lang_code
        )
    # Linkage sections are controlled by the linkage flag; the inflection
    # flag only governs the "odmiana" branch further down.
    elif title_text in LINKAGE_TYPES and wxr.config.capture_linkages:
        extract_linkage_section(
            wxr,
            page_data,
            level_node,
            LINKAGE_TYPES[title_text],
            base_data.lang_code,
        )
    elif title_text in ["uwagi", "składnia"]:
        extract_note_section(wxr, page_data, base_data, level_node)
    elif title_text == "odmiana" and wxr.config.capture_inflections:
        extract_inflection_section(
            wxr, page_data, base_data.lang_code, level_node
        )
    elif title_text in FORM_SECTIONS:
        extract_form_section(
            wxr, page_data, base_data, level_node, FORM_SECTIONS[title_text]
        )
    elif title_text == "morfologia":
        extract_morphology_section(wxr, base_data, level_node)
    elif title_text not in ["źródła", "klucz"]:
        wxr.wtp.debug(
            f"Unknown section: {title_text}",
            sortid="extractor/pl/page/parse_section/63",
        )

68 

69 

def parse_page(
    wxr: WiktextractContext, page_title: str, page_text: str
) -> list[dict[str, Any]]:
    """Parse the wikitext of one page and return its entries as dicts.

    Each level-2 node is treated as one language section.  The language
    name and code are taken from the first template node in the heading
    that follows a string ending with "(": the template is expanded, the
    ``id`` attribute of its first ``<span>`` becomes the language code and
    its cleaned text becomes the language name.  When no such template is
    found both stay "unknown".

    Language sections whose code is not in
    ``wxr.config.capture_language_codes`` (when that is not ``None``) are
    skipped.  Every level-3 child of a kept section is handed to
    ``parse_section``.  Entries that end up without senses get a single
    sense tagged "no-gloss" before being dumped.
    """
    # page layout
    # https://pl.wiktionary.org/wiki/Wikisłownik:Zasady_tworzenia_haseł
    wxr.wtp.start_page(page_title)
    root = wxr.wtp.parse(page_text, pre_expand=True)
    entries: list[WordEntry] = []

    for lang_node in root.find_child(NodeKind.LEVEL2):
        lang_code, lang_name = "unknown", "unknown"
        heading_cats = {}
        seen_open_paren = False

        for heading_part in itertools.chain.from_iterable(lang_node.largs):
            if isinstance(heading_part, str) and heading_part.strip().endswith(
                "("
            ):
                seen_open_paren = True
                continue
            if not (
                seen_open_paren and isinstance(heading_part, TemplateNode)
            ):
                continue

            # First template after the opening parenthesis: expand it and
            # read the language code / name from the result.
            expanded = wxr.wtp.parse(
                wxr.wtp.node_to_wikitext(heading_part),
                expand_all=True,
            )
            first_span = next(iter(expanded.find_html("span")), None)
            if first_span is not None:
                lang_code = first_span.attrs.get("id", "")
            lang_name = clean_node(wxr, heading_cats, expanded)
            break

        wanted_codes = wxr.config.capture_language_codes
        if wanted_codes is not None and lang_code not in wanted_codes:
            continue

        wxr.wtp.start_section(lang_name)
        base_data = WordEntry(
            word=wxr.wtp.title,
            lang_code=lang_code,
            lang=lang_name,
            pos="unknown",
            categories=heading_cats.get("categories", []),
        )
        for section_node in lang_node.find_child(NodeKind.LEVEL3):
            parse_section(wxr, entries, base_data, section_node)

    # Make sure every entry carries at least one sense.
    for entry in entries:
        if not entry.senses:
            entry.senses.append(Sense(tags=["no-gloss"]))

    return [entry.model_dump(exclude_defaults=True) for entry in entries]

123 

124 

def match_sense_index(sense_index: str, word_entry: WordEntry) -> bool:
    """Return whether ``sense_index`` refers to ``word_entry``.

    ``word_entry`` may be an object with a ``senses`` list (its first
    sense is used) or a single ``Sense``.  Anything else, or an empty
    ``senses`` list, never matches.

    Matching rules:

    * An index with a "." but no "," or "-" (e.g. "1.1") must be equal to
      the sense's ``sense_index``.
    * Otherwise only the POS section number is compared, i.e. the part of
      the sense's ``sense_index`` before its first ".".  The argument is
      split on ","; a part containing "." matches when its prefix before
      the "." equals that number, and a part of the form "N-M" matches
      when the number lies in the inclusive range.
    """
    if hasattr(word_entry, "senses"):
        if not word_entry.senses:
            return False
        sense = word_entry.senses[0]
    elif isinstance(word_entry, Sense):
        sense = word_entry
    else:
        # Neither an entry nor a sense: nothing to compare against.
        return False

    # find exact match for index like "1.1"
    is_exact = (
        "." in sense_index
        and "," not in sense_index
        and "-" not in sense_index
    )
    if is_exact:
        return sense_index == sense.sense_index

    # The POS section number comes from the sense's own index.  Slicing it
    # at the dot position of the *argument* breaks for inputs such as
    # "1-3" (no dot) or "10.1, 2.1" (dot at a different offset).
    pos_index_str = sense.sense_index.partition(".")[0]
    if not pos_index_str.isdigit():
        return False
    pos_section_index = int(pos_index_str)

    for part_of_index in sense_index.split(","):
        part_of_index = part_of_index.strip()
        if (
            "." in part_of_index
            and pos_index_str == part_of_index.partition(".")[0]
        ):
            return True
        if re.fullmatch(r"\d+-\d+", part_of_index):
            start_str, end_str = part_of_index.split("-")
            if int(start_str) <= pos_section_index <= int(end_str):
                return True

    return False