Coverage for src/wiktextract/extractor/ja/pos.py: 92%

77 statements  

« prev     ^ index     » next       coverage.py v7.6.4, created at 2024-10-25 10:11 +0000

1from wikitextprocessor.parser import LevelNode, NodeKind, TemplateNode, WikiNode 

2 

3from ...page import clean_node 

4from ...wxr_context import WiktextractContext 

5from ..ruby import extract_ruby 

6from .example import extract_example_list_item 

7from .header import extract_header_nodes 

8from .models import AltForm, Sense, WordEntry 

9from .section_titles import POS_DATA 

10from .tags import translate_raw_tags 

11 

12 

def parse_pos_section(
    wxr: WiktextractContext,
    page_data: list[WordEntry],
    base_data: WordEntry,
    level_node: LevelNode,
    pos_title: str,
) -> None:
    """Extract a part-of-speech section into a new WordEntry.

    A deep copy of ``base_data`` is appended to ``page_data`` and filled
    from the "#" gloss lists in ``level_node``; header nodes preceding the
    first gloss list are handed to ``extract_header_nodes``.  If the section
    contains no gloss list at all, the entry is discarded again.
    """
    entry = base_data.model_copy(deep=True)
    entry.pos_title = pos_title
    pos_info = POS_DATA[pos_title]
    entry.pos = pos_info["pos"]
    entry.tags.extend(pos_info.get("tags", []))
    page_data.append(entry)

    first_gloss_index = 0
    for child_index, child_list in level_node.find_child(NodeKind.LIST, True):
        # Lists whose marker does not end in "#" are linkage lists,
        # handled by another extractor.
        if not child_list.sarg.endswith("#"):
            continue
        if first_gloss_index == 0:
            first_gloss_index = child_index
        for item in child_list.find_child(NodeKind.LIST_ITEM):
            process_gloss_list_item(wxr, entry, item)

    # Everything before the first gloss list is header material
    # (headword line, forms, etc.).
    extract_header_nodes(wxr, entry, level_node.children[:first_gloss_index])
    if first_gloss_index == 0:
        # No gloss list found: drop the entry appended above.
        page_data.pop()

39 

40 

def process_gloss_list_item(
    wxr: WiktextractContext,
    word_entry: WordEntry,
    list_item_node: WikiNode,
    parent_gloss: str = "",
) -> None:
    """Extract one "#" list item into a Sense appended to word_entry.

    Nested "*"/":" sub-lists are treated as example sentences for this
    sense; nested "#" sub-lists are child glosses and are processed
    recursively with this item's gloss text as their parent gloss.
    """
    # Direct children that are not sub-lists make up the gloss text.
    gloss_nodes = list(
        list_item_node.invert_find_child(NodeKind.LIST, include_empty_str=True)
    )
    sense_data = Sense()
    find_form_of_data(wxr, word_entry, sense_data, list_item_node)
    if len(parent_gloss) > 0:
        sense_data.glosses.append(parent_gloss)
    gloss_only_nodes = []
    for gloss_node in gloss_nodes:
        if isinstance(gloss_node, TemplateNode):
            if gloss_node.template_name in ("context", "タグ"):
                # https://ja.wiktionary.org/wiki/テンプレート:context
                # https://ja.wiktionary.org/wiki/テンプレート:タグ
                # These templates render as "(tag1, tag2)"; split the
                # expansion into raw tags instead of keeping it in the
                # gloss text.  clean_node also records any categories
                # the template emits onto sense_data.
                for raw_tag in (
                    clean_node(wxr, sense_data, gloss_node)
                    .strip("()")
                    .split(",")
                ):
                    raw_tag = raw_tag.strip()
                    if len(raw_tag) > 0:
                        sense_data.raw_tags.append(raw_tag)
            elif gloss_node.template_name == "wikipedia-s":
                # Keep the expanded text but strip the trailing
                # superscript "(wp)" Wikipedia-link marker.
                expanded_text = clean_node(wxr, None, gloss_node)
                gloss_only_nodes.append(
                    expanded_text.removesuffix("⁽ʷᵖ⁾").strip()
                )
            elif gloss_node.template_name == "wp":
                # Bare Wikipedia-link template; contributes nothing
                # to the gloss text.
                continue
            else:
                gloss_only_nodes.append(gloss_node)
        else:
            gloss_only_nodes.append(gloss_node)
    # Re-parse with templates expanded so ruby markup becomes visible.
    expanded_gloss = wxr.wtp.parse(
        wxr.wtp.node_to_wikitext(gloss_only_nodes), expand_all=True
    )
    # Separate ruby (furigana) annotations from the plain gloss text.
    ruby, no_ruby = extract_ruby(wxr, expanded_gloss.children)
    gloss_text = clean_node(wxr, sense_data, no_ruby)
    sense_data.ruby = ruby
    if len(gloss_text) > 0:
        sense_data.glosses.append(gloss_text)
        translate_raw_tags(sense_data)
        word_entry.senses.append(sense_data)

    for nest_gloss_list in list_item_node.find_child(NodeKind.LIST):
        if nest_gloss_list.sarg.endswith(("*", ":")):
            # Example sentences belonging to this sense.
            for example_list_item in nest_gloss_list.find_child(
                NodeKind.LIST_ITEM
            ):
                extract_example_list_item(
                    wxr, word_entry, sense_data, example_list_item
                )
        elif nest_gloss_list.sarg.endswith("#"):
            # Child glosses: recurse, passing this item's gloss text
            # (possibly empty) as the parent gloss.
            for nest_list_item in nest_gloss_list.find_child(
                NodeKind.LIST_ITEM
            ):
                process_gloss_list_item(
                    wxr, word_entry, nest_list_item, gloss_text
                )

105 

106 

def find_form_of_data(
    wxr: WiktextractContext,
    word_entry: WordEntry,
    sense: Sense,
    list_item_node: WikiNode,
) -> None:
    """Populate ``sense.form_of`` for form-of definitions.

    For every "... of" template in the list item, the first non-empty link
    in its expansion is recorded as an AltForm.  If nothing was found and
    the entry is tagged "form-of", fall back to the first non-empty plain
    link in the list item itself.
    """
    for t_node in list_item_node.find_child(NodeKind.TEMPLATE):
        if not t_node.template_name.endswith(" of"):
            continue
        expanded = wxr.wtp.parse(
            wxr.wtp.node_to_wikitext(t_node), expand_all=True
        )
        for link in expanded.find_child_recursively(NodeKind.LINK):
            target = clean_node(wxr, None, link)
            if target != "":
                sense.form_of.append(AltForm(word=target))
                break  # only the first non-empty link per template

    if len(sense.form_of) == 0 and "form-of" in word_entry.tags:
        # Fallback: use the first plain link in the list item.
        for link in list_item_node.find_child(NodeKind.LINK):
            target = clean_node(wxr, None, link)
            if target != "":
                sense.form_of.append(AltForm(word=target))
                break