Coverage for src/wiktextract/extractor/zh/inflection.py: 92%

1from itertools import zip_longest

3from wikitextprocessor import NodeKind, WikiNode

5from ...page import clean_node

6from ...wxr_context import WiktextractContext

7from .models import Form, WordEntry

8from .tags import translate_raw_tags

# Name prefixes of the Japanese inflection-table templates handled by
# extract_inflections().  The tuple is passed directly to str.startswith()
# on the lowercased template name, so every entry must be lowercase and any
# template whose name merely begins with one of them also matches.
# Template category: https://zh.wiktionary.org/wiki/Category:日語變格表模板
JAPANESE_INFLECTION_TEMPLATE_PREFIXES = (
    "ja-i",
    "ja-adj-infl",
    "ja-conj-bungo",
    "ja-go",
    "ja-honorific",
    "ja-ichi",
    "ja-kuru",
    "ja-suru",
    "ja-verbconj",
    "ja-zuru",
)

def extract_inflections(
    wxr: WiktextractContext,
    page_data: list[WordEntry],
    level_node: WikiNode,
) -> None:
    """Extract forms from Japanese inflection templates under a section.

    Each direct template child of ``level_node`` whose lowercased name
    starts with one of ``JAPANESE_INFLECTION_TEMPLATE_PREFIXES`` is turned
    back into wikitext, re-parsed with full expansion, and every table
    found anywhere in the expanded tree is handed to
    ``extract_ja_inf_table``.  Results are written through ``page_data``;
    nothing is returned.
    """
    for template in level_node.find_child(NodeKind.TEMPLATE):
        name = template.template_name.lower()
        if not name.startswith(JAPANESE_INFLECTION_TEMPLATE_PREFIXES):
            continue
        # Expand the template so the generated table markup becomes nodes.
        wikitext = wxr.wtp.node_to_wikitext(template)
        expanded = wxr.wtp.parse(wikitext, expand_all=True)
        for table in expanded.find_child_recursively(NodeKind.TABLE):
            extract_ja_inf_table(wxr, page_data, table)

def extract_ja_inf_table(
    wxr: WiktextractContext,
    page_data: list[WordEntry],
    table_node: WikiNode,
) -> None:
    """Append the forms found in one expanded inflection table.

    Rows with exactly one non-empty child are treated specially:

    * if the row contains ``<small>`` elements, each one whose text starts
      with "¹" or "²" is stored as a footnote, keyed by that marker;
    * otherwise the row text becomes the table header, which is prepended
      to the raw tags of every form extracted from the rows after it.

    Every other row produces forms.  Header cells and bold text inside data
    cells contribute raw tags.  Data cells that contain a ``<span>`` are
    counted in order, and the class of the first span decides where the
    cell's lines go: ``Latn`` lines are romanizations, ``Jpan`` lines are
    forms in the first such cell and hiragana in the second.

    A trailing "¹"/"²" on a form is removed and kept as a raw tag of that
    form; once all rows are read, such markers are replaced by the footnote
    text and the tags are translated again.

    New ``Form`` objects are appended to ``page_data[-1].forms``; nothing
    is returned.
    """
    footnote_markers = ("¹", "²")
    table_header = ""
    small_tags_dict = {}
    for row_node in table_node.find_child(NodeKind.TABLE_ROW):
        if len(list(row_node.filter_empty_str_child())) == 1:
            has_small_tag = False
            # table end tags
            for small_tag in row_node.find_html_recursively("small"):
                has_small_tag = True
                tag_text = clean_node(wxr, None, small_tag)
                if tag_text.startswith(footnote_markers):
                    small_tags_dict[tag_text[0]] = tag_text[1:].strip()
            if not has_small_tag:
                table_header = clean_node(wxr, None, row_node.children)
            continue

        form_list = []
        hiragana_list = []
        roman_list = []
        raw_tags = []
        # Kept parallel to form_list: one entry per form, None when the form
        # carries no footnote marker.  Collecting only the markers that are
        # present would shift them onto the wrong forms when zipped below.
        small_tags = []
        cell_node_index = 0
        for row_child in row_node.find_child(
            NodeKind.TABLE_HEADER_CELL | NodeKind.TABLE_CELL
        ):
            if row_child.kind == NodeKind.TABLE_HEADER_CELL:
                for line in clean_node(wxr, None, row_child).splitlines():
                    line = line.strip("（） ")
                    if line:
                        raw_tags.append(line)
            elif row_child.kind == NodeKind.TABLE_CELL:
                if cell_node_index >= 3:
                    break
                for bold_node in row_child.find_child(NodeKind.BOLD):
                    # is row header
                    raw_tags.append(clean_node(wxr, None, bold_node))
                for span_tag in row_child.find_html("span"):
                    # The text of the whole cell is used; only the class of
                    # the first span is consulted (see the break below).
                    span_text = clean_node(wxr, None, row_child)
                    span_class = span_tag.attrs.get("class", "")
                    for line in span_text.splitlines():
                        if line == "-":
                            continue
                        marker = None
                        if line.endswith(footnote_markers):
                            marker = line[-1]
                            line = line[:-1]
                        if span_class == "Latn":
                            roman_list.append(line)
                        elif span_class == "Jpan":
                            if cell_node_index == 0:
                                form_list.append(line)
                                small_tags.append(marker)
                            elif cell_node_index == 1:
                                hiragana_list.append(line)
                    # Only cells that contain a span advance the counter.
                    cell_node_index += 1
                    break

        for form, hiragana, roman, small_tag in zip_longest(
            form_list, hiragana_list, roman_list, small_tags
        ):
            if form in [None, "", "－", wxr.wtp.title]:
                continue
            # Build a fresh list for every form so that appending a marker
            # to one form can never leak into its siblings.
            form_raw_tags = list(raw_tags)
            if table_header != "":
                form_raw_tags.insert(0, table_header)
            form_data = Form(
                raw_tags=form_raw_tags,
                source="inflection table",
                form=form,
                hiragana=hiragana or "",
                roman=roman if roman not in [None, "", "－"] else "",
            )
            if small_tag is not None:
                form_data.raw_tags.append(small_tag)
            translate_raw_tags(form_data)
            page_data[-1].forms.append(form_data)

    # Footnote rows are read after the forms that reference them, so the
    # markers are resolved in a second pass over the collected forms.
    for form_data in page_data[-1].forms:
        if form_data.source == "inflection table":
            for index, raw_tag in enumerate(form_data.raw_tags):
                if raw_tag in small_tags_dict:
                    form_data.raw_tags[index] = small_tags_dict[raw_tag]
            translate_raw_tags(form_data)