Coverage for src / wiktextract / extractor / cs / pos.py: 92%

84 statements  

« prev     ^ index     » next       coverage.py v7.13.0, created at 2025-12-12 08:09 +0000

1from wikitextprocessor.parser import ( 

2 LEVEL_KIND_FLAGS, 

3 LevelNode, 

4 NodeKind, 

5 TemplateNode, 

6 WikiNode, 

7) 

8 

9from ...page import clean_node 

10from ...wxr_context import WiktextractContext 

11from .example import extract_example_list_item 

12from .models import AltForm, Sense, WordEntry 

13from .section_titles import POS_DATA 

14from .tags import translate_raw_tags 

15 

16 

def extract_pos_section(
    wxr: WiktextractContext,
    page_data: list[WordEntry],
    base_data: WordEntry,
    level_node: LevelNode,
    pos_title: str,
):
    """Open a new word entry for a part-of-speech section.

    A deep copy of ``base_data`` is appended to ``page_data`` and becomes the
    entry that the rest of this function fills in. Its ``pos_title``, ``pos``
    and ``tags`` come from ``pos_title`` and ``POS_DATA[pos_title]``;
    ``base_data.pos`` is updated to the same POS value.

    Only ``*`` lists directly under ``level_node`` are inspected:

    * If ``level_node`` contains a child section, every whitespace-separated
      word found in the italic nodes of each list item (except ``"rod"``) is
      stored as a raw tag of the entry.
    * Otherwise, the first link in each list item that cleans to non-empty
      text yields a ``form-of`` sense whose gloss is the whole list item, and
      the entry itself is tagged ``form-of`` once.

    Raises ``KeyError`` if ``pos_title`` is not a key of ``POS_DATA``.
    """
    entry = base_data.model_copy(deep=True)
    page_data.append(entry)
    entry.pos_title = pos_title
    pos_data = POS_DATA[pos_title]
    entry.pos = pos_data["pos"]
    base_data.pos = pos_data["pos"]
    entry.tags.extend(pos_data.get("tags", []))
    has_child_section = level_node.contain_node(LEVEL_KIND_FLAGS)

    for list_node in level_node.find_child(NodeKind.LIST):
        if list_node.sarg != "*":
            continue
        for list_item in list_node.find_child(NodeKind.LIST_ITEM):
            if has_child_section:
                for italic_node in list_item.find_child(NodeKind.ITALIC):
                    italic_str = clean_node(wxr, None, italic_node)
                    for raw_tag in italic_str.split():
                        if raw_tag not in ("", "rod"):
                            entry.raw_tags.append(raw_tag)
            else:
                for link_node in list_item.find_child(NodeKind.LINK):
                    word = clean_node(wxr, None, link_node)
                    if word != "":
                        entry.senses.append(
                            Sense(
                                glosses=[
                                    clean_node(wxr, None, list_item.children)
                                ],
                                tags=["form-of"],
                                form_of=[AltForm(word=word)],
                            )
                        )
                        # Test membership against the tag list, not the
                        # entry object: the old check on the entry itself
                        # was always true, so "form-of" was appended once
                        # per matching list item.
                        if "form-of" not in entry.tags:
                            entry.tags.append("form-of")
                        # Only the first non-empty link of an item is used.
                        break

    translate_raw_tags(entry)

60 

61 

def extract_sense_section(
    wxr: WiktextractContext, word_entry: WordEntry, level_node: LevelNode
):
    """Extract senses from the ``#`` lists directly under ``level_node``.

    Every list item of each such list is handed to
    ``extract_gloss_list_item``; lists with any other marker are ignored.
    """
    numbered_lists = (
        candidate
        for candidate in level_node.find_child(NodeKind.LIST)
        if candidate.sarg == "#"
    )
    for numbered_list in numbered_lists:
        for gloss_item in numbered_list.find_child(NodeKind.LIST_ITEM):
            extract_gloss_list_item(wxr, word_entry, gloss_item)

70 

71 

def extract_gloss_list_item(
    wxr: WiktextractContext,
    word_entry: WordEntry,
    list_item: WikiNode,
    parent_sense: Sense | None = None,
):
    """Build a sense from one gloss list item and recurse into its sub-lists.

    The sense starts as a deep copy of ``parent_sense`` when one is given,
    otherwise as an empty ``Sense``. Children of ``list_item`` are handled as
    follows:

    * ``Příznaky`` templates are passed to ``extract_příznaky_template``.
    * Italic nodes whose cleaned text is wrapped in parentheses are split on
      commas into raw tags.
    * Other italic nodes containing a link are kept as gloss content; when
      the only link is the last (and not the first) child of the italic node,
      its text is recorded as a ``form-of`` target and the scan stops.
    * List nodes are left out of the gloss; everything else is gloss content.

    The sense is appended to ``word_entry.senses`` only when the gloss text
    is non-empty. Nested lists are then processed: markers starting and
    ending with ``#`` recurse with this sense as the parent, while markers
    starting with ``#`` and ending with ``:`` or ``*`` are passed to
    ``extract_example_list_item``.
    """
    if parent_sense is None:
        sense = Sense()
    else:
        sense = parent_sense.model_copy(deep=True)

    gloss_parts = []
    for child in list_item.children:
        if (
            isinstance(child, TemplateNode)
            and child.template_name == "Příznaky"
        ):
            extract_příznaky_template(wxr, sense, child)
            continue
        if not isinstance(child, WikiNode):
            # Plain (non-node) children always belong to the gloss.
            gloss_parts.append(child)
            continue
        if child.kind == NodeKind.LIST:
            # Nested lists are handled after the gloss has been built.
            continue
        if child.kind != NodeKind.ITALIC:
            gloss_parts.append(child)
            continue

        italic_text = clean_node(wxr, None, child)
        if italic_text.startswith("(") and italic_text.endswith(")"):
            for piece in italic_text.strip("() ").split(","):
                tag = piece.strip()
                if tag != "":
                    sense.raw_tags.append(tag)
        elif child.contain_node(NodeKind.LINK):
            gloss_parts.append(child)
            indexed_links = list(
                child.find_child(NodeKind.LINK, with_index=True)
            )
            if len(indexed_links) != 1:
                continue
            link_index, link_node = indexed_links[0]
            if link_index != 0 and link_index == len(child.children) - 1:
                word = clean_node(wxr, None, link_node)
                if word != "":
                    sense.form_of.append(AltForm(word=word))
                    sense.tags.append("form-of")
                # NOTE(review): the flattened source does not show whether
                # this break sits inside or after the enclosing condition;
                # it is placed inside here - confirm against the real file.
                break
        else:
            gloss_parts.append(child)

    gloss_text = clean_node(wxr, sense, gloss_parts)
    if gloss_text != "":
        sense.glosses.append(gloss_text)
        translate_raw_tags(sense)
        word_entry.senses.append(sense)

    for sub_list in list_item.find_child(NodeKind.LIST):
        marker = sub_list.sarg
        if not marker.startswith("#"):
            continue
        if marker.endswith("#"):
            for sub_item in sub_list.find_child(NodeKind.LIST_ITEM):
                extract_gloss_list_item(wxr, word_entry, sub_item, sense)
        elif marker.endswith((":", "*")):
            for sub_item in sub_list.find_child(NodeKind.LIST_ITEM):
                extract_example_list_item(wxr, sense, sub_item)

129 

130 

def extract_příznaky_template(
    wxr: WiktextractContext, sense: Sense, t_node: TemplateNode
):
    """Add the comma-separated labels of a ``Příznaky`` template as raw tags.

    The template is cleaned to text with ``clean_node``, surrounding
    parentheses and spaces are stripped, and each non-empty, trimmed piece
    between commas is appended to ``sense.raw_tags``.
    """
    # https://cs.wiktionary.org/wiki/Šablona:Příznaky
    expanded = clean_node(wxr, sense, t_node)
    trimmed_pieces = (
        piece.strip() for piece in expanded.strip("() ").split(",")
    )
    sense.raw_tags.extend(label for label in trimmed_pieces if label != "")

140 

141 

def extract_note_section(
    wxr: WiktextractContext, word_entry: WordEntry, level_node: LevelNode
):
    """Store the cleaned text of a note section in ``word_entry.note``.

    The text is built from the nodes yielded by
    ``level_node.invert_find_child(LEVEL_KIND_FLAGS, True)`` - presumably the
    section content without nested sub-sections; verify against
    wikitextprocessor.
    """
    note_nodes = list(level_node.invert_find_child(LEVEL_KIND_FLAGS, True))
    word_entry.note = clean_node(wxr, word_entry, note_nodes)