Coverage for src/wiktextract/extractor/zh/inflection.py: 92%

75 statements  

« prev     ^ index     » next       coverage.py v7.6.4, created at 2024-10-25 10:11 +0000

1from itertools import zip_longest 

2 

3from wikitextprocessor import NodeKind, WikiNode 

4 

5from ...page import clean_node 

6from ...wxr_context import WiktextractContext 

7from .models import Form, WordEntry 

8 

# Name prefixes of the Japanese inflection table templates handled by this
# module. They are matched with ``str.startswith`` against the lowercased
# template name, so every entry must be lowercase.
# Template category on zh.wiktionary:
# https://zh.wiktionary.org/wiki/Category:日語變格表模板
JAPANESE_INFLECTION_TEMPLATE_PREFIXES = (
    "ja-i",
    "ja-adj-infl",
    "ja-conj-bungo",
    "ja-go",
    "ja-honorific",
    "ja-ichi",
    "ja-kuru",
    "ja-suru",
    "ja-verbconj",
    "ja-zuru",
)

22 

23 

def extract_inflections(
    wxr: WiktextractContext,
    page_data: list[WordEntry],
    level_node: WikiNode,
) -> None:
    """Collect forms from the Japanese inflection templates of a section.

    Every direct template child of ``level_node`` whose lowercased name
    starts with one of ``JAPANESE_INFLECTION_TEMPLATE_PREFIXES`` is expanded,
    and each table found in the expansion is passed to
    ``extract_ja_inf_table``, which appends the extracted forms to
    ``page_data[-1].forms``.

    Args:
        wxr: Extraction context; ``wxr.wtp`` is used to expand the template.
        page_data: Word entries of the current page; the last one receives
            the forms.
        level_node: Section node whose template children are inspected.
    """
    for template_node in level_node.find_child(NodeKind.TEMPLATE):
        template_name = template_node.template_name.lower()
        if not template_name.startswith(JAPANESE_INFLECTION_TEMPLATE_PREFIXES):
            continue
        # Expand only the matched template. Expanding the whole section here
        # would re-parse every table once per matching template, so a section
        # with several inflection templates would get its forms duplicated.
        expanded_template = wxr.wtp.parse(
            wxr.wtp.node_to_wikitext(template_node), expand_all=True
        )
        for table_node in expanded_template.find_child_recursively(
            NodeKind.TABLE
        ):
            extract_ja_inf_table(wxr, page_data, table_node)

39 

40 

def extract_ja_inf_table(
    wxr: WiktextractContext,
    page_data: list[WordEntry],
    table_node: WikiNode,
) -> None:
    """Extract forms from one expanded Japanese inflection table.

    Rows with a single non-empty child are either the table title (no
    ``<small>`` element inside) or footnote rows (``<small>`` text starting
    with "¹" or "²").  All other rows are data rows: header cells and bold
    text supply raw tags, and up to three data cells supply the form, its
    hiragana reading and its romanization.  One ``Form`` per form line is
    appended to ``page_data[-1].forms``.  After all rows are read, footnote
    marks stored in the raw tags are replaced by the footnote text.

    Args:
        wxr: Extraction context passed through to ``clean_node``.
        page_data: Word entries of the current page; the last one is updated.
        table_node: Table node to read.
    """
    footnote_marks = ("¹", "²")
    # Title text of the table; stays empty when the table has no title row.
    table_header = ""
    # footnote mark -> footnote text, filled from the rows at the table end
    small_tags_dict = {}
    for row_node in table_node.find_child(NodeKind.TABLE_ROW):
        if len(list(row_node.filter_empty_str_child())) == 1:
            has_small_tag = False
            # table end tags
            for small_tag in row_node.find_html_recursively("small"):
                has_small_tag = True
                tag_text = clean_node(wxr, None, small_tag)
                if tag_text.startswith(footnote_marks):
                    small_tags_dict[tag_text[0]] = tag_text[1:].strip()
            if not has_small_tag:
                table_header = clean_node(wxr, None, row_node.children)
            continue

        form_list = []
        # Footnote mark of each entry in form_list (None when unmarked).
        # Kept the same length as form_list so a mark can never be paired
        # with a different form of the same cell.
        form_marks = []
        hiragana_list = []
        roman_list = []
        raw_tags = []
        cell_node_index = 0
        for row_child in row_node.find_child(
            NodeKind.TABLE_HEADER_CELL | NodeKind.TABLE_CELL
        ):
            if row_child.kind == NodeKind.TABLE_HEADER_CELL:
                for line in clean_node(wxr, None, row_child).splitlines():
                    line = line.strip("() ")
                    if line:
                        raw_tags.append(line)
            elif row_child.kind == NodeKind.TABLE_CELL:
                if cell_node_index >= 3:
                    break
                for bold_node in row_child.find_child(NodeKind.BOLD):
                    # is row header
                    raw_tags.append(clean_node(wxr, None, bold_node))
                for span_tag in row_child.find_html("span"):
                    # The text of the whole cell is used; the span only
                    # supplies the script class.
                    span_text = clean_node(wxr, None, row_child)
                    span_class = span_tag.attrs.get("class", "")
                    for line in span_text.splitlines():
                        if line == "-":
                            continue
                        mark = None
                        if line.endswith(footnote_marks):
                            mark = line[-1]
                            line = line[:-1]
                        if span_class == "Latn":
                            roman_list.append(line)
                        elif span_class == "Jpan":
                            if cell_node_index == 0:
                                form_list.append(line)
                                form_marks.append(mark)
                            elif cell_node_index == 1:
                                hiragana_list.append(line)
                    cell_node_index += 1
                    # only the first span of a cell is considered
                    break

        header_tags = [table_header] if table_header else []
        for form, mark, hiragana, roman in zip_longest(
            form_list, form_marks, hiragana_list, roman_list
        ):
            if form is None:
                continue
            form_data = Form(
                raw_tags=header_tags + raw_tags,
                source="inflection table",
                form=form,
                hiragana=hiragana or "",
                roman=roman or "",
            )
            if mark is not None:
                form_data.raw_tags.append(mark)
            page_data[-1].forms.append(form_data)

    # Footnote rows come after the data rows, so marks can only be resolved
    # to their text once the whole table has been read.
    for form_data in page_data[-1].forms:
        if form_data.source == "inflection table":
            for index, raw_tag in enumerate(form_data.raw_tags):
                if raw_tag in small_tags_dict:
                    form_data.raw_tags[index] = small_tags_dict[raw_tag]

119 for index, raw_tag in enumerate(form_data.raw_tags): 

120 if raw_tag in small_tags_dict: 

121 form_data.raw_tags[index] = small_tags_dict[raw_tag]