Coverage for src / wiktextract / extractor / zh / inflection.py: 92%

80 statements  

« prev     ^ index     » next       coverage.py v7.13.0, created at 2025-12-12 08:09 +0000

1import re 

2from itertools import zip_longest 

3 

4from wikitextprocessor import NodeKind, WikiNode 

5 

6from ...page import clean_node 

7from ...wxr_context import WiktextractContext 

8from .models import Form, WordEntry 

9from .tags import translate_raw_tags 

10 

# Name prefixes of the Japanese inflection-table templates handled by this
# module; see https://zh.wiktionary.org/wiki/Category:日語變格表模板
# Kept as a tuple so it can be passed directly to ``str.startswith``.
JAPANESE_INFLECTION_TEMPLATE_PREFIXES = (
    "ja-i",
    "ja-adj-infl",
    "ja-conj-bungo",
    "ja-go-",
    "ja-honorific",
    "ja-ichi",
    "ja-kuru",
    "ja-suru",
    "ja-verbconj",
    "ja-zuru",
    "ja-na",
)

25 

26 

def extract_inflections(
    wxr: WiktextractContext,
    page_data: list[WordEntry],
    level_node: WikiNode,
) -> None:
    """Extract forms from Japanese inflection templates under a section.

    Every direct template child of ``level_node`` whose lower-cased name
    starts with one of ``JAPANESE_INFLECTION_TEMPLATE_PREFIXES`` is turned
    back into wikitext, re-parsed with ``expand_all=True``, and each table
    found anywhere in the expansion is handed to ``extract_ja_inf_table``.
    Other templates are ignored.
    """
    for t_node in level_node.find_child(NodeKind.TEMPLATE):
        if not t_node.template_name.lower().startswith(
            JAPANESE_INFLECTION_TEMPLATE_PREFIXES
        ):
            continue
        # The table only exists in the expanded output, so the template is
        # serialized and parsed again with full expansion.
        expanded_template = wxr.wtp.parse(
            wxr.wtp.node_to_wikitext(t_node), expand_all=True
        )
        for table_node in expanded_template.find_child_recursively(
            NodeKind.TABLE
        ):
            extract_ja_inf_table(wxr, page_data, table_node)

43 

44 

def extract_ja_inf_table(
    wxr: WiktextractContext,
    page_data: list[WordEntry],
    table_node: WikiNode,
) -> None:
    """Collect forms from one expanded Japanese inflection table.

    Rows with exactly one non-empty child are either the footnote row (when
    they contain ``<small>`` tags: every line that starts with a marker
    ``¹``, ``²`` or ``^([N])`` is stored as marker -> explanation) or a
    header row whose cleaned text becomes the leading raw tag of the forms
    built from the rows that follow.

    In every other row, header cells contribute one raw tag per non-empty
    line, bold text in a data cell contributes a raw tag, and up to three
    data cells containing a ``<span>`` are read: ``Jpan`` text is the form
    in the first such cell and the hiragana in the second, ``Latn`` text is
    the romanization. New forms are appended to ``page_data[-1].forms``.

    After all rows are read, footnote markers left in the raw tags of the
    entry's "inflection table" forms are replaced by their explanations and
    the tags are translated again.
    """
    # Loop-invariant patterns for the footnote markers, compiled once.
    marker_at_start = re.compile(r"(¹|²|\^\(\[\d+\]\))")
    marker_at_end = re.compile(r"(¹|²|\^\(\[\d+\]\))$")

    table_header = ""
    small_tags_dict = {}
    for row_node in table_node.find_child(NodeKind.TABLE_ROW):
        if len(list(row_node.filter_empty_str_child())) == 1:
            has_small_tag = False
            # table end tags
            for small_tag in row_node.find_html_recursively("small"):
                has_small_tag = True
                for line in clean_node(wxr, None, small_tag).splitlines():
                    m = marker_at_start.match(line)
                    if m is not None:
                        marker = line[: m.end()]
                        small_tags_dict[marker] = line[m.end() :].strip()
            if not has_small_tag:
                table_header = clean_node(wxr, None, row_node.children)
            continue

        form_list = []
        hiragana_list = []
        roman_list = []
        raw_tags = []
        small_tags = []
        # Counts only data cells that contained a <span>.
        cell_node_index = 0
        for row_child in row_node.find_child(
            NodeKind.TABLE_HEADER_CELL | NodeKind.TABLE_CELL
        ):
            if row_child.kind == NodeKind.TABLE_HEADER_CELL:
                for line in clean_node(wxr, None, row_child).splitlines():
                    line = line.strip("() ")
                    if line:
                        raw_tags.append(line)
            elif row_child.kind == NodeKind.TABLE_CELL:
                if cell_node_index >= 3:
                    break
                # Bold text marks a row header; the span scan below still
                # runs for such a cell.
                for bold_node in row_child.find_child(NodeKind.BOLD):
                    raw_tags.append(clean_node(wxr, None, bold_node))
                # Only the first span is inspected (see the ``break``); its
                # class decides how the text of the whole cell is filed.
                for span_tag in row_child.find_html("span"):
                    span_text = clean_node(wxr, None, row_child)
                    span_class = span_tag.attrs.get("class", "")
                    for line in span_text.splitlines():
                        if line == "-":
                            continue
                        m = marker_at_end.search(line)
                        if m is not None:
                            # The marker is always cut off, but remembered
                            # only for the form column.
                            if cell_node_index == 0:
                                small_tags.append(m.group(1))
                            line = line[: m.start(1)]
                        if span_class == "Latn":
                            roman_list.append(line)
                        elif span_class == "Jpan":
                            if cell_node_index == 0:
                                form_list.append(line)
                            elif cell_node_index == 1:
                                hiragana_list.append(line)
                    cell_node_index += 1
                    break

        if table_header != "":
            row_tags = [table_header, *raw_tags]
        else:
            row_tags = raw_tags
        for form, hiragana, roman, small_tag in zip_longest(
            form_list, hiragana_list, roman_list, small_tags
        ):
            if form in (None, "", "-", wxr.wtp.title):
                continue
            form_data = Form(
                # Fresh list per form so appending a footnote marker below
                # can never leak into the other forms of this row.
                raw_tags=list(row_tags),
                source="inflection table",
                form=form,
                hiragana=hiragana or "",
                roman=roman if roman not in (None, "", "-") else "",
            )
            if small_tag is not None:
                form_data.raw_tags.append(small_tag)
            translate_raw_tags(form_data)
            page_data[-1].forms.append(form_data)

    # The footnote row is only seen while reading the table, so markers are
    # resolved in a second pass. This visits every "inflection table" form
    # of the entry, not only the ones added by this call.
    for form_data in page_data[-1].forms:
        if form_data.source == "inflection table":
            for index, raw_tag in enumerate(form_data.raw_tags):
                if raw_tag in small_tags_dict:
                    form_data.raw_tags[index] = small_tags_dict[raw_tag]
            translate_raw_tags(form_data)

131 if raw_tag in small_tags_dict: 

132 form_data.raw_tags[index] = small_tags_dict[raw_tag] 

133 translate_raw_tags(form_data)