Coverage for src/wiktextract/extractor/cs/pos.py: 94%

75 statements  

« prev     ^ index     » next       coverage.py v7.10.7, created at 2025-10-13 10:14 +0000

1from wikitextprocessor.parser import ( 

2 LEVEL_KIND_FLAGS, 

3 LevelNode, 

4 NodeKind, 

5 TemplateNode, 

6 WikiNode, 

7) 

8 

9from ...page import clean_node 

10from ...wxr_context import WiktextractContext 

11from .example import extract_example_list_item 

12from .models import AltForm, Sense, WordEntry 

13from .section_titles import POS_DATA 

14from .tags import translate_raw_tags 

15 

16 

def extract_pos_section(
    wxr: WiktextractContext,
    page_data: list[WordEntry],
    base_data: WordEntry,
    level_node: LevelNode,
    pos_title: str,
):
    """Open a new word entry for a part-of-speech section.

    A deep copy of ``base_data`` is appended to ``page_data`` and filled in
    from the ``POS_DATA`` record looked up by ``pos_title``; the looked-up
    ``pos`` value is also written back to ``base_data``.  Afterwards every
    whitespace-separated word found in italic nodes of ``*`` list items
    directly under ``level_node`` is stored as a raw tag on the new entry
    (the word ``"rod"`` is skipped), and ``translate_raw_tags`` is run on
    the entry.
    """
    entry = base_data.model_copy(deep=True)
    page_data.append(entry)
    entry.pos_title = pos_title

    pos_info = POS_DATA[pos_title]
    entry.pos = pos_info["pos"]
    base_data.pos = pos_info["pos"]
    entry.tags.extend(pos_info.get("tags", []))

    for bullet_list in level_node.find_child(NodeKind.LIST):
        # Only plain bullet lists are scanned for italic tag words.
        if bullet_list.sarg == "*":
            for item in bullet_list.find_child(NodeKind.LIST_ITEM):
                for italic in item.find_child(NodeKind.ITALIC):
                    words = clean_node(wxr, None, italic).split()
                    entry.raw_tags.extend(
                        word for word in words if word not in ("", "rod")
                    )

    translate_raw_tags(entry)

42 

43 

def extract_sense_section(
    wxr: WiktextractContext, word_entry: WordEntry, level_node: LevelNode
):
    """Extract glosses from the ``#`` lists directly under ``level_node``.

    Each list item of such a list is handed to ``extract_gloss_list_item``
    together with ``word_entry``; lists with any other marker are ignored.
    """
    numbered_lists = (
        child
        for child in level_node.find_child(NodeKind.LIST)
        if child.sarg == "#"
    )
    for numbered_list in numbered_lists:
        for item in numbered_list.find_child(NodeKind.LIST_ITEM):
            extract_gloss_list_item(wxr, word_entry, item)

52 

53 

def extract_gloss_list_item(
    wxr: WiktextractContext,
    word_entry: WordEntry,
    list_item: WikiNode,
    parent_sense: Sense | None = None,
):
    """Build a sense from one gloss list item and recurse into its sublists.

    The sense starts as a deep copy of ``parent_sense`` when one is given,
    otherwise as an empty ``Sense``.  Children of ``list_item`` are sorted
    into tag sources ("Příznaky" templates and parenthesised italic text)
    and gloss nodes; nested lists are left out of the gloss.  The sense is
    appended to ``word_entry.senses`` only when the cleaned gloss text is
    not empty.  Nested ``#…#`` lists are processed recursively with this
    sense as parent, nested ``#…:`` / ``#…*`` lists are passed to
    ``extract_example_list_item``.
    """
    if parent_sense is None:
        sense = Sense()
    else:
        sense = parent_sense.model_copy(deep=True)

    gloss_parts = []
    for elem in list_item.children:
        if isinstance(elem, TemplateNode) and elem.template_name == "Příznaky":
            extract_příznaky_template(wxr, sense, elem)
            continue
        if not isinstance(elem, WikiNode):
            gloss_parts.append(elem)
            continue
        if elem.kind == NodeKind.LIST:
            # Nested lists are handled after the gloss has been built.
            continue
        if elem.kind != NodeKind.ITALIC:
            gloss_parts.append(elem)
            continue

        # Italic node: either "(tag, tag)" labels or part of the gloss.
        italic_text = clean_node(wxr, None, elem)
        if italic_text.startswith("(") and italic_text.endswith(")"):
            labels = (
                label.strip() for label in italic_text.strip("() ").split(",")
            )
            sense.raw_tags.extend(label for label in labels if label != "")
            continue

        has_link = elem.contain_node(NodeKind.LINK)
        gloss_parts.append(elem)
        if not has_link:
            continue

        links = list(elem.find_child(NodeKind.LINK, with_index=True))
        if len(links) != 1:
            continue
        link_index, link_node = links[0]
        # The single link must be the last child and must not be the first.
        if link_index == 0 or link_index != len(elem.children) - 1:
            continue
        target = clean_node(wxr, None, link_node)
        if target != "":
            sense.form_of.append(AltForm(word=target))
            sense.tags.append("form-of")
        # Remaining children of the list item are not examined.
        break

    gloss_text = clean_node(wxr, sense, gloss_parts)
    if gloss_text != "":
        sense.glosses.append(gloss_text)
        translate_raw_tags(sense)
        word_entry.senses.append(sense)

    for sub_list in list_item.find_child(NodeKind.LIST):
        marker = sub_list.sarg
        if not marker.startswith("#"):
            continue
        if marker.endswith("#"):
            for sub_item in sub_list.find_child(NodeKind.LIST_ITEM):
                extract_gloss_list_item(wxr, word_entry, sub_item, sense)
        elif marker.endswith((":", "*")):
            for sub_item in sub_list.find_child(NodeKind.LIST_ITEM):
                extract_example_list_item(wxr, sense, sub_item)

111 

112 

def extract_příznaky_template(
    wxr: WiktextractContext, sense: Sense, t_node: TemplateNode
):
    """Add the comma-separated labels of a "Příznaky" template as raw tags.

    The template is cleaned with ``clean_node``, surrounding parentheses
    and spaces are stripped, and every non-empty label is appended to
    ``sense.raw_tags``.
    """
    # https://cs.wiktionary.org/wiki/Šablona:Příznaky
    expanded = clean_node(wxr, sense, t_node)
    labels = (piece.strip() for piece in expanded.strip("() ").split(","))
    sense.raw_tags.extend(label for label in labels if label != "")

122 

123 

def extract_note_section(
    wxr: WiktextractContext, word_entry: WordEntry, level_node: LevelNode
):
    """Store the cleaned text of a note section in ``word_entry.note``.

    Only the nodes yielded by
    ``level_node.invert_find_child(LEVEL_KIND_FLAGS, True)`` are cleaned —
    presumably everything except nested level (sub-section) nodes; confirm
    against the wikitextprocessor API.
    """
    note_nodes = list(level_node.invert_find_child(LEVEL_KIND_FLAGS, True))
    word_entry.note = clean_node(wxr, word_entry, note_nodes)