Coverage for src/wiktextract/extractor/en/info_templates.py: 84%

64 statements  

« prev     ^ index     » next       coverage.py v7.6.10, created at 2024-12-27 08:07 +0000

1"""Handle specific templates, starting with `Template:+obj`. 

2Each template should have a properly annotated function 

3associated with its name in info_templates, and it should 

4handle the given node based on where it is called from 

5(`location`), like head or sense. 

6""" 

7 

8import re 

9from typing import Callable, Optional, Union 

10 

11from wikitextprocessor import TemplateNode, WikiNode 

12from wikitextprocessor.core import TemplateArgs 

13 

14from ...clean import clean_template_args, clean_value 

15from ...wxr_context import WiktextractContext 

16from .form_descriptions import decode_tags 

17from .type_utils import PlusObjTemplateData, TemplateData 

18 

# A node handed to an info-template handler: either a str or a WikiNode.
InfoNode = Union[str, WikiNode]

# What every handler returns, as a pair:
#   [0] template data field contents, or None;
#   [1] template output, or None if it should not be expressed (like
#       `+obj` in heads). Return the original WikiNode if nothing
#       special needs to happen.
InfoReturnTuple = tuple[Optional[TemplateData], Optional[InfoNode]]

# Signature shared by all handlers:
#   (context, the node being checked, location from where this is called)
#   -> InfoReturnTuple
InfoTemplateFunc = Callable[
    [WiktextractContext, InfoNode, str],
    InfoReturnTuple,
]

37 

# Matches the expansion of `Template:+obj`, e.g.
# `[with infinitive or ergative = meaning]`.
#   group(1): the descriptor text after "with" (no brackets or "=");
#   group(3): the optional meaning after " = ", or None when absent.
PLUSOBJ_RE = re.compile(
    r"\[with ([^][=]+)( = ([^][]+))?\]"
)

39 

40 

def plusobj_func(
    wxr: WiktextractContext, node: InfoNode, loc: str
) -> InfoReturnTuple:
    """Extract structured data from the expansion of `Template:+obj`.

    The template is expanded and cleaned, and the result is expected to
    contain text shaped like `[with infinitive or ergative = meaning]`.
    The part after "with" is split on commas and " or "; each piece is
    run through `decode_tags`. Tagsets containing "error-unknown-tag"
    contribute the piece's whitespace-separated words to "words", all
    other tagsets contribute their tags to "tags" (de-duplicated and
    sorted). The optional part after " = " is stored as "meaning".

    Returns `(template_data, text)`. `text` is the cleaned expansion,
    except when `loc` is "head", where it is None. Returns
    `(None, None)` after logging an error if `node` is not a
    TemplateNode or the expansion does not match `PLUSOBJ_RE`.
    """
    if not isinstance(node, TemplateNode):
        wxr.wtp.error(
            "INFO-TEMPLATES: plusobj_func: node is not a TemplateNode",
            sortid="info_templates/45",
        )
        return None, None

    wikitext = wxr.wtp.node_to_wikitext(node)
    text = clean_value(wxr, wxr.wtp.expand(wikitext))
    match = PLUSOBJ_RE.search(text)
    if match is None:
        wxr.wtp.error(
            "INFO-TEMPLATES: `Template:+obj` expansion does not "
            f"match regex: {text}",
            sortid="info_templates/78",
        )
        return None, None

    descriptors, meaning = match.group(1, 3)

    # Sort every alternative into recognised tags or leftover words.
    words: list[str] = []
    tags: list[str] = []
    for alternative in re.split(r",| or ", descriptors):
        tagsets, _ = decode_tags(alternative)
        for tagset in tagsets:
            if "error-unknown-tag" in tagset:
                words.extend(alternative.split())
            else:
                tags.extend(tagset)

    # Only populated fields are emitted; insertion order is
    # words, tags, meaning.
    extra_data: PlusObjTemplateData = {}
    if words:
        extra_data["words"] = words
    if tags:
        extra_data["tags"] = sorted(set(tags))
    if meaning:
        extra_data["meaning"] = meaning

    ret_template_data: TemplateData = {
        "args": clean_template_args(wxr, node.template_parameters),
        "name": "+obj",
        "extra_data": extra_data,
        "expansion": text,
    }

    # In heads the template output is not expressed; elsewhere (e.g. in
    # a sense) the expanded text is kept.
    return ret_template_data, (None if loc == "head" else text)

97 

98 

# Registry of handlers keyed by template name. A template is treated as
# an info template only if its name appears here.
INFO_TEMPLATE_FUNCS: dict[str, InfoTemplateFunc] = {"+obj": plusobj_func}

102 

103 

def parse_info_template_node(
    wxr: WiktextractContext, node: Union[str, WikiNode], loc: str
) -> InfoReturnTuple:
    """Dispatch `node` to its registered info-template handler.

    Returns the handler's result when `node` is a TemplateNode whose
    template name is a key of `INFO_TEMPLATE_FUNCS`; in every other case
    (plain strings, other node kinds, unregistered templates) returns
    `(None, None)`. `loc` is passed through to the handler unchanged.
    """
    if isinstance(node, WikiNode) and isinstance(node, TemplateNode):
        handler = INFO_TEMPLATE_FUNCS.get(node.template_name)
        if handler is not None:
            return handler(wxr, node, loc)
    return None, None

116 

117 

def _template_arg_order(key: str) -> tuple[int, int, str]:
    """Sort key: numeric argument names by value, then named ones by name."""
    if key.isdecimal():
        # Compare as integers so that "10" follows "2" instead of "1".
        return 0, int(key), key
    return 1, 0, key


def parse_info_template_arguments(
    wxr: WiktextractContext, name: str, args: TemplateArgs, loc: str
) -> InfoReturnTuple:
    """Rebuild a template call from its name and arguments and dispatch it.

    The arguments are serialised back into `{{name|...}}` wikitext:
    arguments whose key is numeric are written as bare positional values
    in ascending numeric order, all others as `key=value`. The wikitext
    is parsed once and its first child is handed to
    `parse_info_template_node` together with `loc`.

    Returns whatever the registered handler returns, or `(None, None)`
    when parsing yields no children or the first child is not a
    WikiNode.
    """
    pairs = sorted(
        ((str(k), v) for k, v in args.items()),
        key=lambda pair: _template_arg_order(pair[0]),
    )
    pieces = ["{{", name]
    for key, value in pairs:
        if key.isnumeric():
            pieces.append(f"|{value}")
        else:
            pieces.append(f"|{key}={value}")
    pieces.append("}}")
    templ_s = "".join(pieces)

    root = wxr.wtp.parse(templ_s)
    if len(root.children) == 0:
        return None, None
    first_child = root.children[0]
    if not isinstance(first_child, WikiNode):
        return None, None

    # Forward the caller's location instead of assuming "sense", so the
    # handler can apply its location-specific behaviour.
    return parse_info_template_node(wxr, first_child, loc)