Coverage for src/wiktextract/extractor/pl/form.py: 90%

97 statements  

« prev     ^ index     » next       coverage.py v7.10.3, created at 2025-08-15 05:18 +0000

1import re 

2 

3from wikitextprocessor import LevelNode, NodeKind, TemplateNode, WikiNode 

4 

5from ...page import clean_node 

6from ...wxr_context import WiktextractContext 

7from .models import Form, WordEntry 

8from .tags import translate_raw_tags 

9 

# Section titles (Polish) handled as "form" sections, each mapped to a list
# of tag strings. An empty list means no tag is associated with the title.
# NOTE(review): presumably the caller looks a section title up here and passes
# the value as `tags` to extract_form_section() -- confirm at the call site.
FORM_SECTIONS: dict[str, list[str]] = {
    "zapis": [],  # "notation / spelling"
    "transliteracja": ["transliteration"],
    "transkrypcja": ["transcription"],
    # "spellings in alternative orthographies"
    "zapisy w ortografiach alternatywnych": ["alternative"],
    "warianty": ["alternative"],  # "variants"
    "kody": ["alternative"],  # "codes"
    "kolejność": ["alternative"],  # "order"
    "kreski": ["alternative"],  # "strokes"
    "słowniki": ["alternative"],  # "dictionaries"
    "hanja": ["hanja"],
}

22 

23 

def extract_form_section(
    wxr: WiktextractContext,
    page_data: list[WordEntry],
    base_data: WordEntry,
    level_node: LevelNode,
    tags: list[str],
) -> None:
    """Collect forms from a section and attach them to the word entries.

    Forms are gathered from three sources, in this order:

    1. List items: the text that follows a parenthesised sense index such
       as ``(1.1)`` in a string node, and the cleaned text of link nodes.
    2. Templates named ``ptrad``/``pupr``, ``translit``, ``ortografie*``
       and ``hep`` found anywhere below ``level_node``.
    3. Only if nothing was found above: the cleaned text of the whole
       section.

    Every created ``Form`` is given ``tags``.

    The collected forms are appended to every entry in ``page_data`` whose
    ``lang_code`` equals ``base_data.lang_code``. If no entry matches
    (including when ``page_data`` is empty) they are appended to
    ``base_data`` instead.

    Args:
        wxr: Extraction context, passed through to ``clean_node`` and the
            template helpers.
        page_data: Entries created so far; matching entries are mutated.
        base_data: Entry whose ``lang_code`` selects the matching entries
            and which receives the forms when nothing matches.
        level_node: The section node to scan.
        tags: Tags assigned to each extracted form.
    """
    forms: list[Form] = []

    # Search recursively so list items wrapped in a "preformatted" node are
    # still found.
    for list_item in level_node.find_child_recursively(NodeKind.LIST_ITEM):
        for node in list_item.children:
            if isinstance(node, str):
                # Inside the character class, `,-.` is a range that covers
                # exactly ",", "-" and ".".
                m = re.search(r"\([\d\s,-.]+\)", node)
                if m is None:
                    continue
                sense_index = m.group(0).strip("()")
                roman = node[m.end() :].strip()
                if roman != "":
                    forms.append(
                        Form(form=roman, sense_index=sense_index, tags=tags)
                    )
            elif isinstance(node, WikiNode) and node.kind == NodeKind.LINK:
                form = clean_node(wxr, None, node)
                if form != "":
                    forms.append(Form(form=form, tags=tags))

    for t_node in level_node.find_child_recursively(NodeKind.TEMPLATE):
        if t_node.template_name in ["ptrad", "pupr"]:
            forms.extend(extract_ptrad_template(wxr, t_node, tags))
        elif t_node.template_name == "translit":
            roman = clean_node(wxr, None, t_node)
            if roman != "":
                forms.append(Form(form=roman, tags=tags))
        elif t_node.template_name.startswith("ortografie"):
            forms.extend(extract_ortografie_template(wxr, t_node, tags))
        elif t_node.template_name == "hep":
            forms.extend(extract_hep_template(wxr, t_node, tags))

    # Nothing structured was found: use the section's plain text as a form.
    if not forms:
        form = clean_node(wxr, None, level_node.children)
        if form != "":
            forms.append(Form(form=form, tags=tags))

    matched = False
    for data in page_data:
        if data.lang_code == base_data.lang_code:
            data.forms.extend(forms)
            matched = True
    # page_data can be empty or hold only entries with a different
    # lang_code; store the forms on base_data so they are not lost.
    if not matched:
        base_data.forms.extend(forms)

75 

76 

def extract_ptrad_template(
    wxr: WiktextractContext, t_node: TemplateNode, tags: list[str]
) -> list[Form]:
    """Extract forms from a ``ptrad``/``pupr`` template node.

    The template's wikitext is re-parsed with ``expand_all=True`` and the
    ``<span>`` elements returned by ``find_html("span")`` are scanned in
    order:

    * a span whose ``class`` contains the token ``short-container`` sets
      the raw tag applied to the forms that follow it;
    * a span with ``lang="zh"`` yields a form from its cleaned text, unless
      that text is empty or equal to the page title.

    Args:
        wxr: Extraction context.
        t_node: The template node to expand.
        tags: Tags assigned to each extracted form.

    Returns:
        The extracted forms, in document order.
    """
    forms: list[Form] = []
    expanded_node = wxr.wtp.parse(
        wxr.wtp.node_to_wikitext(t_node), expand_all=True
    )
    raw_tag = ""
    for span_tag in expanded_node.find_html("span"):
        # The HTML class attribute is a whitespace-separated token list, so
        # test for the token rather than comparing the whole attribute.
        if "short-container" in span_tag.attrs.get("class", "").split():
            raw_tag = clean_node(wxr, None, span_tag)
        if span_tag.attrs.get("lang", "") == "zh":
            word = clean_node(wxr, None, span_tag)
            if word not in ["", wxr.wtp.title]:
                form = Form(form=word, tags=tags)
                if raw_tag != "":
                    form.raw_tags.append(raw_tag)
                    # NOTE(review): presumably maps raw_tags onto tags;
                    # defined in .tags, not visible here.
                    translate_raw_tags(form)
                forms.append(form)
    return forms

97 

98 

def extract_ortografie_template(
    wxr: WiktextractContext,
    t_node: TemplateNode,
    tags: list[str],
) -> list[Form]:
    """Extract forms from a template whose name starts with ``ortografie``.

    The template's wikitext is re-parsed with ``expand_all=True``. Forms
    are read first from the direct children of the parsed root, then from
    every list item of every list that is a direct child of the root.

    Args:
        wxr: Extraction context.
        t_node: The template node to expand.
        tags: Tags assigned to each extracted form.

    Returns:
        Forms from the root's children followed by forms from the list
        items, in document order.
    """
    wikitext = wxr.wtp.node_to_wikitext(t_node)
    root = wxr.wtp.parse(wikitext, expand_all=True)

    # The root is scanned with the same helper as a list item: only its
    # `.children` are inspected.
    collected = extract_ortografie_list_item(wxr, root, tags)
    for list_node in root.find_child(NodeKind.LIST):
        for item in list_node.find_child(NodeKind.LIST_ITEM):
            collected += extract_ortografie_list_item(wxr, item, tags)
    return collected

113 

114 

def extract_ortografie_list_item(
    wxr: WiktextractContext, list_item: WikiNode, tags: list[str]
) -> list[Form]:
    """Extract forms from the direct children of ``list_item``.

    Children are scanned in order. An italic node with non-empty cleaned
    text sets the current raw tag (surrounding ``":"`` and spaces removed).
    A non-blank string node yields a form from its text with surrounding
    ``":"``, newlines and spaces removed; the current raw tag, if any, is
    attached to it. String nodes that are empty after that stripping
    (for example a lone ``":"`` separator) are skipped.

    Args:
        wxr: Extraction context.
        list_item: Node whose ``children`` are scanned.
        tags: Tags assigned to each extracted form.

    Returns:
        The extracted forms, in document order.
    """
    forms: list[Form] = []
    raw_tag = ""
    for node in list_item.children:
        if isinstance(node, WikiNode) and node.kind == NodeKind.ITALIC:
            node_str = clean_node(wxr, None, node)
            if node_str != "":
                raw_tag = node_str.strip(": ")
        elif isinstance(node, str) and node.strip() != "":
            word = node.strip(":\n ")
            # A node made only of separators (e.g. ":") would otherwise
            # produce a Form with an empty `form`.
            if word.strip() == "":
                continue
            form = Form(form=word, tags=tags)
            if raw_tag != "":
                form.raw_tags.append(raw_tag)
                # NOTE(review): presumably maps raw_tags onto tags;
                # defined in .tags, not visible here.
                translate_raw_tags(form)
            forms.append(form)
    return forms

132 

133 

def extract_hep_template(
    wxr: WiktextractContext, t_node: TemplateNode, tags: list[str]
) -> list[Form]:
    """Extract forms from a ``hep`` template node.

    The template's wikitext is re-parsed with ``expand_all=True`` and the
    direct children of the parsed root are scanned in order. An italic
    node whose cleaned text ends with ``":"`` sets the current raw tag
    (with ``":"`` stripped from both ends). Every non-blank string node
    becomes a form from its stripped text, carrying the current raw tag
    if one has been set.

    Args:
        wxr: Extraction context.
        t_node: The template node to expand.
        tags: Tags assigned to each extracted form.

    Returns:
        The extracted forms, in document order.
    """
    root = wxr.wtp.parse(wxr.wtp.node_to_wikitext(t_node), expand_all=True)
    results = []
    label = ""
    for child in root.children:
        if isinstance(child, str):
            text = child.strip()
            if text == "":
                continue
            entry = Form(form=text, tags=tags)
            if label != "":
                entry.raw_tags.append(label)
                # NOTE(review): presumably maps raw_tags onto tags;
                # defined in .tags, not visible here.
                translate_raw_tags(entry)
            results.append(entry)
        elif isinstance(child, WikiNode) and child.kind == NodeKind.ITALIC:
            italic_text = clean_node(wxr, None, child)
            if italic_text.endswith(":"):
                label = italic_text.strip(":")
    return results