Coverage for src/wiktextract/extractor/pl/form.py: 90%

96 statements  

« prev     ^ index     » next       coverage.py v7.9.0, created at 2025-06-13 07:43 +0000

1import re 

2 

3from wikitextprocessor import LevelNode, NodeKind, TemplateNode, WikiNode 

4 

5from ...page import clean_node 

6from ...wxr_context import WiktextractContext 

7from .models import Form, WordEntry 

8from .tags import translate_raw_tags 

9 

# Maps a form-section title to a list of tag strings.
# NOTE(review): presumably the list is what callers hand to
# extract_form_section() as `tags` - confirm against the caller.
FORM_SECTIONS = {
    "zapis": [],
    "transliteracja": ["transliteration"],
    "transkrypcja": ["transcription"],
    # Each of these titles gets its own, separate ["alternative"] list.
    **{
        title: ["alternative"]
        for title in (
            "zapisy w ortografiach alternatywnych",
            "warianty",
            "kody",
            "kolejność",
            "kreski",
            "słowniki",
        )
    },
    "hanja": ["hanja"],
}

22 

23 

def extract_form_section(
    wxr: WiktextractContext,
    page_data: list[WordEntry],
    base_data: WordEntry,
    level_node: LevelNode,
    tags: list[str],
) -> None:
    """Collect forms from a form section and attach them to word entries.

    Forms are gathered from three places, in this order:

    1. List items: a string child containing a parenthesised index such
       as ``(1.1)`` yields the text after it as a form with that
       ``sense_index``; a link child yields its cleaned text as a form.
    2. Templates anywhere below ``level_node``: ``ptrad``/``pupr``,
       ``translit``, names starting with ``ortografie``, and ``hep``.
    3. If nothing was found, the cleaned text of the whole section.

    Args:
        wxr: Extraction context used for cleaning and template expansion.
        page_data: Entries collected so far; every entry whose
            ``lang_code`` equals ``base_data.lang_code`` receives the
            forms.
        base_data: Receives the forms when no entry in ``page_data``
            matches its language code (including when ``page_data`` is
            empty).
        level_node: The section node to read forms from.
        tags: Tags given to every created form.
    """
    forms = []
    # get around "preformatted" node
    for list_item in level_node.find_child_recursively(NodeKind.LIST_ITEM):
        for node in list_item.children:
            if isinstance(node, str):
                # The hyphen is escaped so the class plainly lists the
                # literal characters "," "." "-" instead of relying on
                # the ",-." range happening to cover exactly those three.
                m = re.search(r"\([\d\s,.\-]+\)", node)
                if m is not None:
                    sense_index = m.group(0).strip("()")
                    roman = node[m.end() :].strip()
                    if roman != "":
                        forms.append(
                            Form(
                                form=roman,
                                sense_index=sense_index,
                                tags=tags,
                            )
                        )
            elif isinstance(node, WikiNode) and node.kind == NodeKind.LINK:
                form = clean_node(wxr, None, node)
                if form != "":
                    forms.append(Form(form=form, tags=tags))

    for t_node in level_node.find_child_recursively(NodeKind.TEMPLATE):
        if t_node.template_name in ["ptrad", "pupr"]:
            forms.extend(extract_ptrad_template(wxr, t_node, tags))
        elif t_node.template_name == "translit":
            roman = clean_node(wxr, None, t_node)
            if roman != "":
                forms.append(Form(form=roman, tags=tags))
        elif t_node.template_name.startswith("ortografie"):
            forms.extend(extract_ortografie_template(wxr, t_node, tags))
        elif t_node.template_name == "hep":
            forms.extend(extract_hep_template(wxr, t_node, tags))

    if not forms:
        # Fallback: treat the whole section text as a single form.
        form = clean_node(wxr, None, level_node.children)
        if form != "":
            forms.append(Form(form=form, tags=tags))

    matched = False
    for data in page_data:
        if data.lang_code == base_data.lang_code:
            data.forms.extend(forms)
            matched = True
    if not matched:
        # Previously forms were only stored on base_data when page_data
        # was empty, so they were silently lost when page_data held
        # entries but none for this language.
        base_data.forms.extend(forms)

75 

76 

def extract_ptrad_template(
    wxr: WiktextractContext, t_node: TemplateNode, tags: list[str]
) -> list[Form]:
    """Extract forms from an expanded ``ptrad``/``pupr``-style template.

    The template is expanded and its ``<span>`` elements are scanned in
    order. A span whose ``class`` is exactly ``short-container`` updates
    the current label; a span whose ``lang`` is ``zh`` yields a form,
    unless its text is empty or equal to the page title. A form created
    while a label is set gets that label as a raw tag, after which
    ``translate_raw_tags`` is applied to it.

    Returns:
        The forms found, in document order.
    """
    wikitext = wxr.wtp.node_to_wikitext(t_node)
    root = wxr.wtp.parse(wikitext, expand_all=True)

    results = []
    label = ""
    for span in root.find_html("span"):
        attrs = span.attrs
        # Both checks are independent: one span may match both.
        if attrs.get("class", "") == "short-container":
            label = clean_node(wxr, None, span)
        if attrs.get("lang", "") != "zh":
            continue
        text = clean_node(wxr, None, span)
        if text in ("", wxr.wtp.title):
            continue
        entry = Form(form=text, tags=tags)
        if label:
            entry.raw_tags.append(label)
            translate_raw_tags(entry)
        results.append(entry)
    return results

97 

98 

def extract_ortografie_template(
    wxr: WiktextractContext,
    t_node: TemplateNode,
    tags: list[str],
) -> list[Form]:
    """Extract forms from an expanded ``ortografie...`` template.

    The expanded root node is processed first, then every list item of
    every list that is a direct child of the root; each is handed to
    ``extract_ortografie_list_item``.

    Returns:
        All forms found, root-level ones first.
    """
    root = wxr.wtp.parse(wxr.wtp.node_to_wikitext(t_node), expand_all=True)

    def _scopes():
        # Lazily yield the nodes to scan so that tree traversal and form
        # extraction stay interleaved.
        yield root
        for list_node in root.find_child(NodeKind.LIST):
            yield from list_node.find_child(NodeKind.LIST_ITEM)

    collected = []
    for scope in _scopes():
        collected.extend(extract_ortografie_list_item(wxr, scope, tags))
    return collected

113 

114 

def extract_ortografie_list_item(
    wxr: WiktextractContext, list_item: WikiNode, tags: list[str]
) -> list[Form]:
    """Extract forms from the direct children of ``list_item``.

    Children are scanned in order. An italic child whose cleaned text
    ends with ``:`` sets the current label (colons and spaces stripped
    from both ends). Every non-blank string child becomes a form made of
    the stripped string; if a label is set it is added as a raw tag and
    ``translate_raw_tags`` is applied to the form.

    Only ``list_item.children`` is read.

    Returns:
        The forms found, in child order.
    """
    forms = []
    # Start with no label: text that appears before any italic label
    # must still produce a form instead of raising UnboundLocalError.
    raw_tag = ""
    for node in list_item.children:
        if isinstance(node, WikiNode) and node.kind == NodeKind.ITALIC:
            node_str = clean_node(wxr, None, node)
            if node_str.endswith(":"):
                raw_tag = node_str.strip(": ")
        elif isinstance(node, str) and node.strip() != "":
            form = Form(form=node.strip(), tags=tags)
            if raw_tag != "":
                form.raw_tags.append(raw_tag)
                translate_raw_tags(form)
            forms.append(form)
    return forms

131 

132 

def extract_hep_template(
    wxr: WiktextractContext, t_node: TemplateNode, tags: list[str]
) -> list[Form]:
    """Extract forms from an expanded ``hep`` template.

    The direct children of the expanded template are scanned in order.
    An italic child whose cleaned text ends with ``:`` sets the current
    label (colons stripped from both ends). Every non-blank string child
    becomes a form made of the stripped string; if a label is set it is
    added as a raw tag and ``translate_raw_tags`` is applied to the form.

    Returns:
        The forms found, in child order.
    """
    wikitext = wxr.wtp.node_to_wikitext(t_node)
    root = wxr.wtp.parse(wikitext, expand_all=True)

    results = []
    label = ""
    for child in root.children:
        is_italic = isinstance(child, WikiNode) and (
            child.kind == NodeKind.ITALIC
        )
        if is_italic:
            italic_text = clean_node(wxr, None, child)
            if italic_text.endswith(":"):
                label = italic_text.strip(":")
            continue
        if not isinstance(child, str):
            continue
        text = child.strip()
        if text == "":
            continue
        entry = Form(form=text, tags=tags)
        if label != "":
            entry.raw_tags.append(label)
            translate_raw_tags(entry)
        results.append(entry)
    return results