Coverage for src/wiktextract/extractor/ku/form_table.py: 92%

99 statements  

« prev     ^ index     » next       coverage.py v7.9.2, created at 2025-07-04 10:58 +0000

1import re 

2from dataclasses import dataclass 

3 

4from wikitextprocessor import NodeKind, TemplateNode 

5 

6from ...page import clean_node 

7from ...wxr_context import WiktextractContext 

8from .models import Form, WordEntry 

9from .tags import translate_raw_tags 

10from .tewandin import extract_tewandin_page 

11 

12 

def extract_ku_tewîn_nav_template(
    wxr: WiktextractContext, word_entry: WordEntry, t_node: TemplateNode
) -> None:
    """Append the forms found in an expanded ``ku-tewîn-nav`` table.

    The template node is converted back to wikitext, re-parsed with full
    expansion, and every table found directly under the result is walked row
    by row.  Each non-empty line of a data cell that differs from the page
    title becomes a ``Form`` appended to ``word_entry.forms``.

    Tags attached to each form:

    * a gender tag derived from template parameter ``2``
      (``"mê"`` -> ``feminine``, ``"nêr"`` -> ``masculine``, otherwise none);
    * ``indefinite`` / ``definite`` taken from the most recent single-child
      header row whose text ends in ``" nebinavkirî"`` / ``" binavkirî"``;
    * raw tags from the current row header and from the column header at the
      same data-cell position, which are then passed through
      ``translate_raw_tags``.

    Args:
        wxr: Extraction context; its ``wtp`` is used for parsing and for the
            page title.
        word_entry: Entry whose ``forms`` list is extended in place.
        t_node: The ``ku-tewîn-nav`` template node.
    """
    # https://ku.wiktionary.org/wiki/Şablon:ku-tewîn-nav
    template_wikitext = wxr.wtp.node_to_wikitext(t_node)
    parsed_root = wxr.wtp.parse(template_wikitext, expand_all=True)

    gender_by_arg = {"mê": ["feminine"], "nêr": ["masculine"]}
    gender_tags = gender_by_arg.get(
        clean_node(wxr, None, t_node.template_parameters.get(2, "")), []
    )

    for table in parsed_root.find_child(NodeKind.TABLE):
        # State carried across the rows of one table.
        row_label = ""
        column_labels = []
        section_tags = []
        for row in table.find_child(NodeKind.TABLE_ROW):
            # Counts data cells only, so it lines up with `column_labels`.
            data_col = 0
            for cell in row.find_child(
                NodeKind.TABLE_HEADER_CELL | NodeKind.TABLE_CELL
            ):
                if cell.kind == NodeKind.TABLE_HEADER_CELL:
                    header_text = clean_node(wxr, None, cell)
                    if len(row.children) == 1:
                        # Lone header cell: a section title that may switch
                        # the definiteness tag for the following rows.
                        if header_text.endswith(" nebinavkirî"):
                            section_tags = ["indefinite"]
                        elif header_text.endswith(" binavkirî"):
                            section_tags = ["definite"]
                    elif row.contain_node(NodeKind.TABLE_CELL):
                        # Header sharing a row with data cells labels the row.
                        row_label = header_text
                    elif header_text not in ["Rewş", ""]:
                        # Header-only row: collect column labels, skipping
                        # the "Rewş" header and empty headers.
                        column_labels.append(header_text)
                    continue

                if len(row.children) == 1:
                    # A lone data cell carries no forms to extract.
                    continue

                for line in clean_node(wxr, None, cell).splitlines():
                    if line in ("", wxr.wtp.title):
                        continue
                    form = Form(form=line, tags=gender_tags + section_tags)
                    if row_label:
                        form.raw_tags.append(row_label)
                    if data_col < len(column_labels):
                        form.raw_tags.append(column_labels[data_col])
                    translate_raw_tags(form)
                    word_entry.forms.append(form)
                data_col += 1

61 

62 

@dataclass
class TableHeader:
    """A table header label together with its position and span.

    In this file instances are created by ``extract_ku_tewîn_lk_template``,
    which always passes ``text``, ``rowspan`` and ``row_index`` explicitly and
    leaves ``col_index`` / ``colspan`` at their defaults.
    """

    # Cleaned text of the header cell.
    text: str
    # Index of the table row in which the header was found.
    row_index: int = 0
    # Number of rows the header covers.
    # NOTE(review): with the default of 0 the range check
    # `row_index < header.row_index + header.rowspan` is never true, so a
    # header built without an explicit rowspan would apply to no row.
    rowspan: int = 0
    # Column position of the header; not set by the code visible here.
    col_index: int = 0
    # Number of columns the header covers; not set by the code visible here.
    colspan: int = 0

70 

71 

def extract_ku_tewîn_lk_template(
    wxr: WiktextractContext, word_entry: WordEntry, t_node: TemplateNode
) -> None:
    """Append the forms found in an expanded ``ku-tewîn-lk`` table.

    The template node is converted back to wikitext, re-parsed with full
    expansion, and every table found directly under the result is walked row
    by row.

    Rows with exactly one child are treated as banners and never produce
    forms themselves:

    * text ending in ``" gerguhêz)"`` / ``" negerguhêz)"`` sets the shared
      tag to ``transitive`` / ``intransitive`` and resets the row counter and
      the collected headers;
    * text starting with ``"Rehê dema "`` only performs that reset;
    * text starting with ``"Formên din:"`` passes the remainder of the text
      to ``extract_tewandin_page``.

    In every other row the header cells (with their ``rowspan``) and the
    first non-empty data cell are recorded as headers.  Each later non-empty
    data cell is split on ``"/"``; every piece that is not empty and not the
    page title becomes a ``Form`` whose raw tags are the texts of all
    recorded headers covering the current row, passed through
    ``translate_raw_tags``.

    Args:
        wxr: Extraction context; its ``wtp`` is used for parsing and for the
            page title.
        word_entry: Entry whose ``forms`` list is extended in place.
        t_node: The ``ku-tewîn-lk`` template node.
    """
    # https://ku.wiktionary.org/wiki/Şablon:ku-tewîn-lk
    template_wikitext = wxr.wtp.node_to_wikitext(t_node)
    parsed_root = wxr.wtp.parse(template_wikitext, expand_all=True)

    other_forms_prefix = "Formên din:"

    for table in parsed_root.find_child(NodeKind.TABLE):
        # State carried across the rows of one table.
        current_row = 0
        section_tags = []
        active_headers = []
        for row in table.find_child(NodeKind.TABLE_ROW):
            if len(row.children) == 1:
                banner = clean_node(wxr, None, row.children)
                starts_new_section = True
                if banner.endswith(" gerguhêz)"):
                    section_tags = ["transitive"]
                elif banner.endswith(" negerguhêz)"):
                    section_tags = ["intransitive"]
                elif not banner.startswith("Rehê dema "):
                    starts_new_section = False
                    if banner.startswith(other_forms_prefix):
                        extract_tewandin_page(
                            wxr,
                            word_entry,
                            banner[len(other_forms_prefix) :].strip(),
                        )
                if starts_new_section:
                    # Headers seen so far belong to the previous section.
                    current_row = 0
                    active_headers.clear()
                # Banner rows are not counted in `current_row`.
                continue

            for header_cell in row.find_child(NodeKind.TABLE_HEADER_CELL):
                span_attr = header_cell.attrs.get("rowspan", "1")
                # A non-numeric rowspan attribute falls back to one row.
                span = int(span_attr) if re.fullmatch(r"\d+", span_attr) else 1
                active_headers.append(
                    TableHeader(
                        text=clean_node(wxr, None, header_cell),
                        rowspan=span,
                        row_index=current_row,
                    )
                )

            data_cells = row.find_child(NodeKind.TABLE_CELL)
            for cell_position, cell in enumerate(data_cells):
                cell_text = clean_node(wxr, None, cell)
                if cell_text == "":
                    continue
                if cell_position == 0:
                    # The first data cell labels the row it sits in.
                    active_headers.append(
                        TableHeader(
                            text=cell_text, rowspan=1, row_index=current_row
                        )
                    )
                    continue
                for piece in cell_text.split("/"):
                    candidate = piece.strip()
                    if candidate in ("", wxr.wtp.title):
                        continue
                    form = Form(form=candidate, tags=section_tags)
                    # Attach every header whose row span covers this row.
                    form.raw_tags.extend(
                        header.text
                        for header in active_headers
                        if header.row_index
                        <= current_row
                        < header.row_index + header.rowspan
                    )
                    translate_raw_tags(form)
                    word_entry.forms.append(form)

            current_row += 1