Coverage for src/wiktextract/extractor/cs/declension.py: 98%

61 statements  

« prev     ^ index     » next       coverage.py v7.10.7, created at 2025-10-13 10:14 +0000

1from dataclasses import dataclass 

2 

3from wikitextprocessor import LevelNode, NodeKind, TemplateNode 

4 

5from ...page import clean_node 

6from ...wxr_context import WiktextractContext 

7from ..share import capture_text_in_parentheses 

8from .models import Form, WordEntry 

9from .tags import translate_raw_tags 

10 

# Tags implied by the title of a declension section (Czech "skloňování
# mužské" / "skloňování ženské", i.e. masculine / feminine declension).
# Section titles that are not listed here contribute no extra tags.
DECLENSION_SECTION_TAGS = {
    "skloňování mužské": ["masculine"],
    "skloňování ženské": ["feminine"],
}

15 

16 

def extract_declension_section(
    wxr: WiktextractContext,
    word_entry: WordEntry,
    level_node: LevelNode,
    section_title: str,
):
    """Extract inflected forms from the templates of a declension section.

    Every direct child template of ``level_node`` whose name starts with
    one of the known inflection-table prefixes is handed to
    ``extract_substantivum_template``, which appends the forms it finds to
    ``word_entry.forms``.

    Args:
        wxr: Extraction context, forwarded unchanged.
        word_entry: Entry that receives the extracted forms.
        level_node: Section node whose child templates are scanned.
        section_title: Section title, looked up in
            ``DECLENSION_SECTION_TAGS`` to obtain tags that apply to every
            form of this section (none when the title is not listed).
    """
    # Copy the looked-up list so downstream code never holds a reference
    # to the module-level lists stored in DECLENSION_SECTION_TAGS.
    section_tags = list(DECLENSION_SECTION_TAGS.get(section_title, ()))
    for t_node in level_node.find_child(NodeKind.TEMPLATE):
        # The trailing space in each prefix is deliberate: only names of
        # the shape "<prefix> <something>" are accepted.
        if t_node.template_name.startswith(
            ("Substantivum ", "Adjektivum ", "Stupňování ", "Sloveso ")
        ):
            extract_substantivum_template(wxr, word_entry, t_node, section_tags)

29 

30 

@dataclass
class TableHeader:
    """A header cell of an inflection table together with its position."""

    # Cleaned text of the header cell; may be empty.
    text: str
    # Number of columns the cell spans (its "colspan" attribute).
    colspan: int
    # Number of rows the cell spans (its "rowspan" attribute).
    rowspan: int
    # Column offset at which the cell starts within its row.
    col_index: int
    # Zero-based index of the table row that contains the cell.
    row_index: int

38 

39 

def _parse_span(value: str) -> int:
    """Return a ``colspan``/``rowspan`` attribute value as an int.

    Falls back to 1 (a cell covering a single column/row) when ``int()``
    cannot convert the value, so one malformed attribute does not abort
    extraction of the whole table.
    """
    try:
        return int(value)
    except (TypeError, ValueError):
        return 1


def _matching_header_texts(
    row_headers: "list[TableHeader]",
    col_headers: "list[TableHeader]",
    row_index: int,
    col_index: int,
) -> list[str]:
    """Return the non-empty header texts that apply to one table position.

    Row headers are matched on the rows they span, column headers on the
    columns they span. Row header texts come first, then column header
    texts, each in the order the headers were recorded.
    """
    texts = [
        header.text
        for header in row_headers
        if header.text != ""
        and header.row_index <= row_index < header.row_index + header.rowspan
    ]
    texts.extend(
        header.text
        for header in col_headers
        if header.text != ""
        and header.col_index <= col_index < header.col_index + header.colspan
    )
    return texts


def extract_substantivum_template(
    wxr: WiktextractContext,
    word_entry: WordEntry,
    t_node: TemplateNode,
    section_tags: list[str],
):
    """Extract forms from the table(s) produced by an inflection template.

    The template is expanded and re-parsed. For every table found, each
    non-header cell is split on ``" / "`` and every resulting word (except
    empty strings, ``"—"`` and the page title) is appended to
    ``word_entry.forms`` as a ``Form``. The form's raw tags are, in order:
    the parenthesised texts captured from the cell, the table caption (if
    any), the texts of the row headers covering the cell's row and the
    texts of the column headers covering the cell's column.

    Args:
        wxr: Extraction context; ``wxr.wtp`` is used to expand and parse
            the template and to read the page title.
        word_entry: Entry that receives the extracted forms.
        t_node: The inflection-table template node.
        section_tags: Tags given to every extracted form.
    """
    # https://cs.wiktionary.org/wiki/Šablona:Substantivum_(cs)
    expanded_node = wxr.wtp.parse(
        wxr.wtp.node_to_wikitext(t_node), expand_all=True
    )
    # Return value unused; presumably called for its side effects on
    # word_entry — NOTE(review): confirm against clean_node.
    clean_node(wxr, word_entry, expanded_node)
    for table in expanded_node.find_child(NodeKind.TABLE):
        col_headers: list[TableHeader] = []
        row_headers: list[TableHeader] = []
        table_caption = ""
        for caption_node in table.find_child(NodeKind.TABLE_CAPTION):
            table_caption = clean_node(wxr, None, caption_node.children)
        for row_index, row in enumerate(table.find_child(NodeKind.TABLE_ROW)):
            # A row without any data cell is treated as a row of column
            # headers; header cells in any other row are row headers.
            is_column_header = not row.contain_node(NodeKind.TABLE_CELL)
            row_kind_headers = col_headers if is_column_header else row_headers
            # Start after the columns still occupied by multi-row headers
            # (of this row's header kind) that reach into this row.
            col_index = 0
            for header in row_kind_headers:
                if (
                    header.rowspan > 1
                    and header.row_index
                    <= row_index
                    < header.row_index + header.rowspan
                ):
                    col_index += header.colspan
            for cell in row.find_child(
                NodeKind.TABLE_HEADER_CELL | NodeKind.TABLE_CELL
            ):
                cell_text = clean_node(wxr, None, cell)
                colspan = _parse_span(cell.attrs.get("colspan", "1"))
                rowspan = _parse_span(cell.attrs.get("rowspan", "1"))
                if cell.kind == NodeKind.TABLE_HEADER_CELL:
                    row_kind_headers.append(
                        TableHeader(
                            cell_text, colspan, rowspan, col_index, row_index
                        )
                    )
                else:
                    for word in cell_text.split(" / "):
                        # Unpacked as (texts found in parentheses, rest of
                        # the text) — NOTE(review): confirm against
                        # capture_text_in_parentheses.
                        cell_tags, word = capture_text_in_parentheses(word)
                        word = word.strip()
                        if word in ["", "—", wxr.wtp.title]:
                            continue
                        # Give every form its own tag list so that changes
                        # to one form's tags cannot leak into other forms
                        # or into the caller's list.
                        form = Form(
                            form=word,
                            tags=list(section_tags),
                            raw_tags=cell_tags,
                        )
                        if table_caption != "":
                            form.raw_tags.append(table_caption)
                        form.raw_tags.extend(
                            _matching_header_texts(
                                row_headers, col_headers, row_index, col_index
                            )
                        )
                        translate_raw_tags(form)
                        word_entry.forms.append(form)
                # Advance past this cell, whether header or data.
                col_index += colspan