Coverage for src/wiktextract/extractor/en/analyze_template.py: 89%

61 statements  

« prev     ^ index     » next       coverage.py v7.6.4, created at 2024-10-25 10:11 +0000

1import re 

2from collections import defaultdict 

3 

4from wikitextprocessor import Page, Wtp 

5 

6 

def analyze_template(wtp: Wtp, page: Page) -> tuple[set[str], bool]:
    """Analyze a template body for called templates and pre-expansion need.

    Returns a tuple ``(included_templates, pre_expand)``:

    * ``included_templates`` is the set of names of the other templates
      that the body calls outside of a paired table.  Names are taken as
      written in the body, with ``<nowiki />`` markers removed.
    * ``pre_expand`` is True if the template should be pre-expanded before
      final parsing and False if it need not be.  It is True when the body
      starts with a list item, contains unpaired table start/end tokens,
      contains table element tokens outside templates, or contains
      unbalanced paired HTML tags.

    The pre-expand flag is determined based on this body only; the caller
    should propagate it to templates that include the given template.
    This does not work for template and template function calls where the
    name is generated by other expansions.

    A redirect page (``page.redirect_to`` is not None) yields
    ``(set(), False)``.
    """
    if page.redirect_to is not None:
        return set(), False
    included_templates: set[str] = set()
    body = page.body

    # Determine if the template starts with a list item
    # XXX should we expand other templates that produce list items???
    contains_list = body.startswith(("#", "*", ";", ":"))

    # Remove paired tables.
    # What is left is unpaired tables, which is an indication that a
    # template somewhere should be generating those table eventually,
    # and thus needs to be pre-expanded.
    table_start_pos: list[int] = []
    table_end_pos: list[int] = []
    # `[[wikt:/|}]]` in Template:Mon standard keyboard
    # and `{{l|mul|} }}` in Template:punctuation are not end of table token
    # but `|}]]` in Template:Lithuania map is a table
    for m in re.finditer(
        r"""
        (?<!{){\|  # `{|` not after `{`, like `{{{|}}}`
        |
        \|}(?!\s*})  # `|}` not before ` }`
        """,
        body,
        re.VERBOSE,
    ):
        if m.group() == "{|":
            table_start_pos.append(m.start())
        else:
            table_end_pos.append(m.end())
    num_table_start = len(table_start_pos)
    num_table_end = len(table_end_pos)
    contains_unpaired_table = num_table_start != num_table_end

    # Default: nothing is cut out (both slice points at the end of the body).
    table_start = len(body)
    table_end = table_start
    # NOTE(review): in the two unbalanced branches the chosen index looks
    # like it pulls one unpaired token into the removed span; confirm this
    # is intended rather than an off-by-one before changing it.
    if num_table_start > num_table_end and num_table_end > 0:
        table_start = table_start_pos[num_table_start - num_table_end - 1]
        table_end = table_end_pos[-1]
    elif num_table_start < num_table_end and num_table_start > 0:
        table_start = table_start_pos[0]
        table_end = table_end_pos[num_table_start]
    elif num_table_start > 0 and num_table_end > 0:
        table_start = table_start_pos[0]
        table_end = table_end_pos[-1]
    unpaired_text = body[:table_start] + body[table_end:]

    # Determine if the template contains table element tokens
    # outside paired table start/end.  We only try to look for
    # these outside templates, as it is common to write each
    # template argument on its own line starting with a "|".
    #
    # Both patterns are loop-invariant, so compile them once up front.
    # {{{ }}} parameters without templates inside them
    argument_re = re.compile(
        # re.X, ignore white space and comments
        r"""(?sx)\{\{\{    # {{{
            (  [^{}]       # no {} except...
            |  \}[^}]      # no }} unless...
            |  \}\}[^}]    # they're definitely not }}}
            )*?
            \}\}\}         # }}}
        """
    )
    # {{ }} template calls without nested braces inside them
    template_re = re.compile(
        r"""(?sx)\{\{
            (  [^{}]
            |  \}[^}]
            )*?
            \}\}"""
    )
    outside = unpaired_text
    while True:
        prev = outside

        # Strip template parameters repeatedly until none are left, so
        # nested parameters are peeled from the inside out.
        while True:
            newt = argument_re.sub("", prev)
            if newt == prev:
                break
            prev = newt

        # Strip one layer of templates, then start over: removing a
        # template may expose new innermost parameters/templates.
        newt = template_re.sub("", newt)
        if newt == outside:
            break
        outside = newt

    # Check if the template contains certain table elements:
    # a line starting with `|+`, `|-` or `!`, or a leading
    # `<includeonly>` / HTML comment directly followed by `||` or `!!`.
    m = re.search(r"(?s)(^|\n)(\|\+|\|-|\!)", outside)
    m2 = re.match(r"(?si)\s*(<includeonly>|<!--.*?-->)(\|\||!!)", outside)
    contains_table_element = m is not None or m2 is not None

    # Check for unpaired HTML tags
    tag_cnts: defaultdict[str, int] = defaultdict(int)
    for m in re.finditer(
        # The attribute part must be non-greedy: a greedy `[^>]*` would
        # swallow the trailing `/` of a self-closing tag, leaving the
        # last group always empty.
        r"(?si)<(/)?({})\b\s*[^>]*?(/)?>".format(
            "|".join(wtp.paired_html_tags)
        ),
        outside,
    ):
        start_slash = m.group(1)
        # Matching is case-insensitive, so count case-insensitively too;
        # otherwise `<DIV>...</div>` would look unbalanced.
        tagname = m.group(2).lower()
        end_slash = m.group(3)
        if start_slash:
            tag_cnts[tagname] -= 1
        elif not end_slash:
            # Self-closing tags (`<tag />`) neither open nor close.
            tag_cnts[tagname] += 1
    contains_unbalanced_html = any(v != 0 for v in tag_cnts.values())

    # Determine which other templates are called from unpaired text.
    # None of the flags we currently gather propagate outside a paired
    # table start/end.
    for m in re.finditer(
        # capture the first parameter of a template, ie. the name
        r"""(?sx)(^ | [^{])        # start
            (\{\{)?\{\{([^{]*?)    # ( ({{) {{ (name) )
            (\| | \}\})            # | or }}""",
        unpaired_text,
    ):
        called_template = re.sub(r"(?si)<nowiki\s*/>", "", m.group(3))
        if called_template:
            included_templates.add(called_template)

    # Determine whether this template should be pre-expanded
    pre_expand = (
        contains_list
        or contains_unpaired_table
        or contains_table_element
        or contains_unbalanced_html
    )

    return included_templates, pre_expand