Coverage for src/wiktextract/extractor/pl/page.py: 72%

107 statements  

« prev     ^ index     » next       coverage.py v7.6.4, created at 2024-10-25 10:11 +0000

1import itertools 

2import re 

3from typing import Any 

4 

5from wikitextprocessor.parser import LevelNode, NodeKind, TemplateNode, WikiNode 

6 

7from ...page import clean_node 

8from ...wxr_context import WiktextractContext 

9from .etymology import extract_etymology_section 

10from .example import extract_example_section 

11from .inflection import extract_inflection_section 

12from .linkage import LINKAGE_TYPES, extract_linkage_section 

13from .models import Form, Sense, WordEntry 

14from .note import extract_note_section 

15from .pos import extract_pos_section 

16from .sound import extract_sound_section 

17from .translation import extract_translation_section 

18 

19 

def parse_section(
    wxr: WiktextractContext,
    page_data: list[WordEntry],
    base_data: WordEntry,
    level_node: WikiNode,
) -> None:
    """Dispatch a section of a page to the matching extractor.

    The section title is produced by title templates:
    https://pl.wiktionary.org/wiki/Kategoria:Szablony_szablonów_haseł
    Sections whose title is not recognized are silently ignored.
    """
    title_text = clean_node(wxr, None, level_node.largs)
    wxr.wtp.start_subsection(title_text)
    if title_text == "wymowa" and wxr.config.capture_pronunciation:
        extract_sound_section(wxr, base_data, level_node)
    elif title_text == "znaczenia":
        extract_pos_section(wxr, page_data, base_data, level_node)
    elif title_text == "przykłady":
        extract_example_section(wxr, page_data, base_data, level_node)
    elif title_text == "etymologia" and wxr.config.capture_etymologies:
        extract_etymology_section(wxr, page_data, base_data, level_node)
    elif title_text == "tłumaczenia" and wxr.config.capture_translations:
        extract_translation_section(
            wxr, page_data, level_node, base_data.lang_code
        )
    # fix: this branch was gated on `capture_inflections`, so turning off
    # inflection capture also silently dropped every linkage section;
    # linkage extraction has its own `capture_linkages` switch
    elif title_text in LINKAGE_TYPES and wxr.config.capture_linkages:
        extract_linkage_section(
            wxr,
            page_data,
            level_node,
            LINKAGE_TYPES[title_text],
            base_data.lang_code,
        )
    elif title_text == "uwagi":
        extract_note_section(wxr, page_data, base_data, level_node)
    elif title_text == "odmiana" and wxr.config.capture_inflections:
        extract_inflection_section(
            wxr, page_data, base_data.lang_code, level_node
        )
    elif title_text == "zapis":
        extract_zapis_section(wxr, base_data, level_node)
    elif title_text == "transliteracja":
        extract_transliteracja_section(wxr, base_data, level_node)

60 

61 

def extract_zapis_section(
    wxr: WiktextractContext, base_data: WordEntry, level_node: LevelNode
) -> None:
    """Extract Traditional Chinese spellings from "ptrad" templates.

    Templates are searched recursively to get around the
    "preformatted" node they may be wrapped in.
    """
    for t_node in level_node.find_child_recursively(NodeKind.TEMPLATE):
        if t_node.template_name != "ptrad":
            continue
        first_arg = t_node.template_parameters.get(1, "")
        form_text = clean_node(wxr, None, first_arg)
        if form_text:
            new_form = Form(form=form_text, tags=["Traditional Chinese"])
            base_data.forms.append(new_form)

75 

76 

def extract_transliteracja_section(
    wxr: WiktextractContext, base_data: WordEntry, level_node: LevelNode
) -> None:
    """Extract romanization forms from a "transliteracja" section.

    Each list item is expected to contain plain text of the shape
    "(<sense indices>) <romanization>"; the parenthesized indices become
    the form's ``sense_index``.
    """
    for list_item in level_node.find_child_recursively(NodeKind.LIST_ITEM):
        for child in list_item.children:
            if not isinstance(child, str):
                continue
            index_match = re.search(r"\([\d\s,-.]+\)", child)
            if index_match is None:
                continue
            index_text = index_match.group(0).strip("()")
            romanized = child[index_match.end() :].strip()
            if len(romanized) > 0:
                new_form = Form(
                    form=romanized,
                    sense_index=index_text,
                    tags=["romanization"],
                )
                base_data.forms.append(new_form)

95 

96 

def parse_page(
    wxr: WiktextractContext, page_title: str, page_text: str
) -> list[dict[str, Any]]:
    """Parse one pl.wiktionary page into a list of word-entry dicts.

    Page layout:
    https://pl.wiktionary.org/wiki/Wikisłownik:Zasady_tworzenia_haseł
    """
    wxr.wtp.start_page(page_title)
    tree = wxr.wtp.parse(page_text, pre_expand=True)
    page_data: list[WordEntry] = []
    for level2_node in tree.find_child(NodeKind.LEVEL2):
        # The language template follows an opening "(" in the section
        # title, e.g. "słowo ({{język polski}})".
        seen_open_paren = False
        lang_code = "unknown"
        lang_name = "unknown"
        lang_title_cats: dict[str, Any] = {}
        title_nodes = itertools.chain.from_iterable(level2_node.largs)
        for title_node in title_nodes:
            if isinstance(title_node, str) and title_node.strip().endswith(
                "("
            ):
                seen_open_paren = True
            elif seen_open_paren and isinstance(title_node, TemplateNode):
                expanded = wxr.wtp.parse(
                    wxr.wtp.node_to_wikitext(title_node),
                    expand_all=True,
                )
                # The expanded language template carries the language
                # code in the "id" attribute of its first span.
                for span_tag in expanded.find_html("span"):
                    lang_code = span_tag.attrs.get("id", "")
                    break
                lang_name = clean_node(wxr, lang_title_cats, expanded)
                break
        capture_codes = wxr.config.capture_language_codes
        if capture_codes is not None and lang_code not in capture_codes:
            continue
        wxr.wtp.start_section(lang_name)
        base_data = WordEntry(
            word=wxr.wtp.title,
            lang_code=lang_code,
            lang=lang_name,
            pos="unknown",
            categories=lang_title_cats.get("categories", []),
        )
        for level3_node in level2_node.find_child(NodeKind.LEVEL3):
            parse_section(wxr, page_data, base_data, level3_node)

    for entry in page_data:
        if not entry.senses:
            entry.senses.append(Sense(tags=["no-gloss"]))
    return [entry.model_dump(exclude_defaults=True) for entry in page_data]

150 

151 

def match_sense_index(sense_index: str, word_entry: WordEntry) -> bool:
    """Return True if *word_entry*'s first sense matches *sense_index*.

    *word_entry* may be a ``WordEntry`` (first sense is used) or, duck-typed,
    a ``Sense`` itself.  *sense_index* is either an exact index like "1.1"
    or a list/range expression like "1.1, 1.2" or "1-3"; for the latter we
    only compare the POS section number (the part before the first ".").
    """
    if hasattr(word_entry, "senses"):
        if len(word_entry.senses) == 0:
            return False
        sense = word_entry.senses[0]
    elif isinstance(word_entry, Sense):
        sense = word_entry
    else:
        # fix: previously fell through with `sense` unbound -> NameError
        return False
    # an index like "1.1" (no list, no range) must match exactly
    exact_match = not (
        "," in sense_index or "-" in sense_index or "." not in sense_index
    )
    if exact_match:
        return sense_index == sense.sense_index

    # POS section number of the sense's own index, i.e. the part before
    # its first ".".  fix: this was sliced with `sense_index.find(".")`
    # (the argument's dot position), which e.g. for a range like "1-3"
    # chopped the last character instead and broke all range matches.
    pos_index_str = sense.sense_index.split(".")[0]
    if not pos_index_str.isdigit():
        return False
    pos_section_index = int(pos_index_str)

    for part_of_index in sense_index.split(","):
        part_of_index = part_of_index.strip()
        if (
            "." in part_of_index
            and pos_index_str == part_of_index[: part_of_index.find(".")]
        ):
            return True
        elif re.fullmatch(r"\d+-\d+", part_of_index):
            start_str, end_str = part_of_index.split("-")
            if int(start_str) <= pos_section_index <= int(end_str):
                return True

    return False