Coverage for src/wiktextract/extractor/pl/form.py: 90%

1import re

3from wikitextprocessor import LevelNode, NodeKind, TemplateNode, WikiNode

5from ...page import clean_node

6from ...wxr_context import WiktextractContext

7from .models import Form, WordEntry

8from .tags import translate_raw_tags

# Polish section titles mapped to the tag list used for forms found in that
# section.  An empty list means no extra tag.
# NOTE(review): presumably the caller looks a section title up here and passes
# the value as `tags` to extract_form_section -- the lookup is not in this
# file, confirm against the caller.
FORM_SECTIONS: dict[str, list[str]] = {
    "zapis": [],
    "transliteracja": ["transliteration"],
    "transkrypcja": ["transcription"],
    # All of the following titles share the same tag; each key still gets
    # its own list object.
    **{
        title: ["alternative"]
        for title in (
            "zapisy w ortografiach alternatywnych",
            "warianty",
            "kody",
            "kolejność",
            "kreski",
            "słowniki",
        )
    },
    "hanja": ["hanja"],
}

def extract_form_section(
    wxr: WiktextractContext,
    page_data: list[WordEntry],
    base_data: WordEntry,
    level_node: LevelNode,
    tags: list[str],
) -> None:
    """Collect the forms found under a section node and store them.

    Forms are gathered from three places, in this order:

    1. list items: a string child containing a parenthesised index such as
       ``(1.1)`` yields the text after it as a form with that sense index;
       a link child yields its cleaned text as a form;
    2. templates: ``ptrad``/``pupr``, ``translit``, names starting with
       ``ortografie`` and ``hep``;
    3. if nothing was found, the cleaned text of the whole section.

    Args:
        wxr: Extraction context handed to ``clean_node`` and the template
            helpers.
        page_data: Entries extracted so far; each entry whose ``lang_code``
            equals ``base_data.lang_code`` receives the forms.
        base_data: Receives the forms only when ``page_data`` is empty.
        level_node: The section node to scan.
        tags: Tags given to every created ``Form``.
    """
    collected = []

    # Search recursively to get around a "preformatted" node wrapping the
    # list.
    for item in level_node.find_child_recursively(NodeKind.LIST_ITEM):
        for child in item.children:
            if isinstance(child, str):
                # Inside the character class `,-.` is a range; it covers
                # exactly ",", "-" and ".".
                index_match = re.search(r"\([\d\s,-.]+\)", child)
                if index_match is None:
                    continue
                text_after = child[index_match.end() :].strip()
                if text_after != "":
                    collected.append(
                        Form(
                            form=text_after,
                            sense_index=index_match.group(0).strip("()"),
                            tags=tags,
                        )
                    )
            elif isinstance(child, WikiNode) and child.kind == NodeKind.LINK:
                link_text = clean_node(wxr, None, child)
                if link_text != "":
                    collected.append(Form(form=link_text, tags=tags))

    for template in level_node.find_child_recursively(NodeKind.TEMPLATE):
        name = template.template_name
        if name in ("ptrad", "pupr"):
            collected.extend(extract_ptrad_template(wxr, template, tags))
        elif name == "translit":
            romanized = clean_node(wxr, None, template)
            if romanized != "":
                collected.append(Form(form=romanized, tags=tags))
        elif name.startswith("ortografie"):
            collected.extend(extract_ortografie_template(wxr, template, tags))
        elif name == "hep":
            collected.extend(extract_hep_template(wxr, template, tags))

    # Fallback: treat the whole section text as a single form.
    if not collected:
        section_text = clean_node(wxr, None, level_node.children)
        if section_text != "":
            collected.append(Form(form=section_text, tags=tags))

    if not page_data:
        base_data.forms.extend(collected)
        return
    for entry in page_data:
        if entry.lang_code == base_data.lang_code:
            entry.forms.extend(collected)

def extract_ptrad_template(
    wxr: WiktextractContext, t_node: TemplateNode, tags: list[str]
) -> list[Form]:
    """Extract forms from an expanded ``ptrad``/``pupr`` template.

    The template is expanded and its ``<span>`` elements are walked in
    order.  A span with class ``short-container`` updates the current label;
    a span with ``lang="zh"`` whose cleaned text is neither empty nor the
    page title becomes a ``Form`` carrying that label as a raw tag.

    Args:
        wxr: Extraction context used for expansion and cleaning.
        t_node: The template node to expand.
        tags: Tags given to every created ``Form``.

    Returns:
        The forms found, in document order.
    """
    wikitext = wxr.wtp.node_to_wikitext(t_node)
    root = wxr.wtp.parse(wikitext, expand_all=True)

    results = []
    label = ""
    for span in root.find_html("span"):
        attrs = span.attrs
        # Both checks are independent on purpose: one span may match both.
        if attrs.get("class", "") == "short-container":
            label = clean_node(wxr, None, span)
        if attrs.get("lang", "") != "zh":
            continue
        text = clean_node(wxr, None, span)
        if text in ["", wxr.wtp.title]:
            continue
        entry = Form(form=text, tags=tags)
        if label != "":
            entry.raw_tags.append(label)
            translate_raw_tags(entry)
        results.append(entry)
    return results

def extract_ortografie_template(
    wxr: WiktextractContext,
    t_node: TemplateNode,
    tags: list[str],
) -> list[Form]:
    """Extract forms from an expanded ``ortografie...`` template.

    The expanded root is scanned first, then every list item of every list
    that is a direct child of the root.

    Args:
        wxr: Extraction context used for expansion.
        t_node: The template node to expand.
        tags: Tags given to every created ``Form``.

    Returns:
        Forms from the root followed by forms from the list items.
    """
    wikitext = wxr.wtp.node_to_wikitext(t_node)
    root = wxr.wtp.parse(wikitext, expand_all=True)

    results = list(extract_ortografie_list_item(wxr, root, tags))
    for sub_list in root.find_child(NodeKind.LIST):
        for item in sub_list.find_child(NodeKind.LIST_ITEM):
            results += extract_ortografie_list_item(wxr, item, tags)
    return results

113

114

def extract_ortografie_list_item(
    wxr: WiktextractContext, list_item: WikiNode, tags: list[str]
) -> list[Form]:
    """Build forms from the direct children of an ``ortografie`` node.

    An italic child whose cleaned text ends with ":" sets the label for the
    text that follows; every non-blank string child becomes one ``Form``.

    Args:
        wxr: Extraction context passed to ``clean_node``.
        list_item: Node whose direct children are scanned.
        tags: Tags given to every created ``Form``.

    Returns:
        The forms found, in child order.
    """
    forms = []
    # Must be set before the loop: a text child can come before any italic
    # label, or the italic may not end with ":".  Reading the name without
    # this default would raise UnboundLocalError.
    raw_tag = ""
    for node in list_item.children:
        if isinstance(node, WikiNode) and node.kind == NodeKind.ITALIC:
            node_str = clean_node(wxr, None, node)
            if node_str.endswith(":"):
                raw_tag = node_str.strip(": ")
        elif isinstance(node, str) and node.strip() != "":
            form = Form(form=node.strip(), tags=tags)
            if raw_tag != "":
                form.raw_tags.append(raw_tag)
                translate_raw_tags(form)
            forms.append(form)
    return forms

131

132

def extract_hep_template(
    wxr: WiktextractContext, t_node: TemplateNode, tags: list[str]
) -> list[Form]:
    """Extract forms from an expanded ``hep`` template.

    Direct children of the expanded template are walked in order.  An
    italic child whose cleaned text ends with ":" sets the current label;
    every non-blank string child becomes a ``Form`` carrying that label as
    a raw tag when one is set.

    Args:
        wxr: Extraction context used for expansion and cleaning.
        t_node: The template node to expand.
        tags: Tags given to every created ``Form``.

    Returns:
        The forms found, in child order.
    """
    wikitext = wxr.wtp.node_to_wikitext(t_node)
    root = wxr.wtp.parse(wikitext, expand_all=True)

    results = []
    label = ""
    for child in root.children:
        if isinstance(child, str):
            text = child.strip()
            if text == "":
                continue
            entry = Form(form=text, tags=tags)
            if label != "":
                entry.raw_tags.append(label)
                translate_raw_tags(entry)
            results.append(entry)
        elif isinstance(child, WikiNode) and child.kind == NodeKind.ITALIC:
            italic_text = clean_node(wxr, None, child)
            if italic_text.endswith(":"):
                label = italic_text.strip(":")
    return results