Coverage for src/wiktextract/extractor/zh/inflection.py: 92%

77 statements  

« prev     ^ index     » next       coverage.py v7.9.2, created at 2025-07-11 10:26 +0000

1from itertools import zip_longest 

2 

3from wikitextprocessor import NodeKind, WikiNode 

4 

5from ...page import clean_node 

6from ...wxr_context import WiktextractContext 

7from .models import Form, WordEntry 

8from .tags import translate_raw_tags 

9 

# Name prefixes of the Japanese inflection-table templates handled by this
# module. Kept as a tuple so it can be passed directly to ``str.startswith``;
# template names are lower-cased before the comparison.
# Template list: https://zh.wiktionary.org/wiki/Category:日語變格表模板
JAPANESE_INFLECTION_TEMPLATE_PREFIXES = (
    "ja-i",
    "ja-adj-infl",
    "ja-conj-bungo",
    "ja-go",
    "ja-honorific",
    "ja-ichi",
    "ja-kuru",
    "ja-suru",
    "ja-verbconj",
    "ja-zuru",
)

23 

24 

def extract_inflections(
    wxr: WiktextractContext,
    page_data: list[WordEntry],
    level_node: WikiNode,
) -> None:
    """Extract forms from the Japanese inflection templates of a section.

    Each direct template child of ``level_node`` whose lower-cased name
    starts with one of ``JAPANESE_INFLECTION_TEMPLATE_PREFIXES`` is expanded,
    and every table found anywhere in the expansion is handed to
    ``extract_ja_inf_table``. Other templates are ignored.
    """
    for template in level_node.find_child(NodeKind.TEMPLATE):
        name = template.template_name.lower()
        if not name.startswith(JAPANESE_INFLECTION_TEMPLATE_PREFIXES):
            continue
        # Re-parse the template's own wikitext with full expansion so the
        # generated table markup becomes real nodes.
        wikitext = wxr.wtp.node_to_wikitext(template)
        expanded = wxr.wtp.parse(wikitext, expand_all=True)
        for table in expanded.find_child_recursively(NodeKind.TABLE):
            extract_ja_inf_table(wxr, page_data, table)

41 

42 

def extract_ja_inf_table(
    wxr: WiktextractContext,
    page_data: list[WordEntry],
    table_node: WikiNode,
) -> None:
    """Extract forms from one expanded Japanese inflection table.

    Rows with exactly one non-empty child are treated specially: if they
    contain ``<small>`` elements, those are read as footnotes keyed by their
    leading "¹"/"²" marker; otherwise the row text becomes the current table
    header, which is prepended to the raw tags of the rows that follow.
    All other rows are parsed into forms that are appended to
    ``page_data[-1].forms`` with ``source="inflection table"``.

    After all rows are read, footnote markers left in the raw tags of the
    inflection-table forms are replaced with the footnote text.

    Args:
        wxr: Context used for cleaning nodes and for the page title.
        page_data: Word entries of the page; only the last one is modified.
            Nothing is done when the list is empty.
        table_node: The table node to read.
    """
    if not page_data:
        # There is no word entry to attach forms to.
        return

    entry = page_data[-1]
    table_header = ""
    small_tags_dict = {}  # footnote marker ("¹"/"²") -> footnote text
    for row_node in table_node.find_child(NodeKind.TABLE_ROW):
        if len(list(row_node.filter_empty_str_child())) == 1:
            has_small_tag = False
            # Footnote rows (the original comment calls them "table end
            # tags"): each <small> starts with its marker character.
            for small_tag in row_node.find_html_recursively("small"):
                has_small_tag = True
                tag_text = clean_node(wxr, None, small_tag)
                if tag_text.startswith(("¹", "²")):
                    small_tags_dict[tag_text[0]] = tag_text[1:].strip()
            if not has_small_tag:
                table_header = clean_node(wxr, None, row_node.children)
            continue

        raw_tags, form_list, hiragana_list, roman_list, small_tags = (
            _parse_ja_inf_row(wxr, row_node)
        )
        for form, hiragana, roman, small_tag in zip_longest(
            form_list, hiragana_list, roman_list, small_tags
        ):
            if form in [None, "", "-", wxr.wtp.title]:
                continue
            # Always build a fresh list: the footnote marker appended below
            # must not leak into the other forms of the same row.
            if table_header != "":
                form_raw_tags = [table_header, *raw_tags]
            else:
                form_raw_tags = list(raw_tags)
            form_data = Form(
                raw_tags=form_raw_tags,
                source="inflection table",
                form=form,
                hiragana=hiragana or "",
                roman=roman if roman not in [None, "", "-"] else "",
            )
            if small_tag is not None:
                # Stored as the bare marker for now; swapped for the footnote
                # text once the footnote rows have been read.
                form_data.raw_tags.append(small_tag)
            translate_raw_tags(form_data)
            entry.forms.append(form_data)

    for form_data in entry.forms:
        if form_data.source != "inflection table":
            continue
        for index, raw_tag in enumerate(form_data.raw_tags):
            if raw_tag in small_tags_dict:
                form_data.raw_tags[index] = small_tags_dict[raw_tag]
        # Run once after all replacements so the footnote text itself gets
        # translated. NOTE(review): assumes translate_raw_tags leaves already
        # processed forms unchanged - confirm against .tags.
        translate_raw_tags(form_data)


def _parse_ja_inf_row(
    wxr: WiktextractContext, row_node: WikiNode
) -> tuple[list[str], list[str], list[str], list[str], list]:
    """Collect the tags and form strings of one multi-cell table row.

    Returns ``(raw_tags, form_list, hiragana_list, roman_list, small_tags)``.
    ``small_tags`` is parallel to ``form_list``: for every form it holds the
    trailing "¹"/"²" marker that was stripped from it, or ``None``.
    """
    raw_tags = []
    form_list = []
    hiragana_list = []
    roman_list = []
    small_tags = []
    # Counts the data cells that contained a <span>; "Jpan" text goes to
    # form_list for the first such cell and hiragana_list for the second.
    cell_node_index = 0
    for row_child in row_node.find_child(
        NodeKind.TABLE_HEADER_CELL | NodeKind.TABLE_CELL
    ):
        if row_child.kind == NodeKind.TABLE_HEADER_CELL:
            for line in clean_node(wxr, None, row_child).splitlines():
                line = line.strip("() ")
                if line:
                    raw_tags.append(line)
        elif row_child.kind == NodeKind.TABLE_CELL:
            if cell_node_index >= 3:
                break
            # Bold text in a data cell is treated as a row header.
            for bold_node in row_child.find_child(NodeKind.BOLD):
                raw_tags.append(clean_node(wxr, None, bold_node))
            for span_tag in row_child.find_html("span"):
                # Only the first <span> is inspected (see the break below):
                # its class decides the list, while the text of the whole
                # cell is split into lines.
                span_text = clean_node(wxr, None, row_child)
                span_class = span_tag.attrs.get("class", "")
                for line in span_text.splitlines():
                    if line == "-":
                        continue
                    marker = None
                    if line.endswith(("¹", "²")):
                        marker = line[-1]
                        line = line[:-1]
                    if span_class == "Latn":
                        roman_list.append(line)
                    elif span_class == "Jpan":
                        if cell_node_index == 0:
                            form_list.append(line)
                            # Keep the marker at the same index as its form
                            # so zip_longest pairs them correctly.
                            small_tags.append(marker)
                        elif cell_node_index == 1:
                            hiragana_list.append(line)
                cell_node_index += 1
                break
    return raw_tags, form_list, hiragana_list, roman_list, small_tags