Coverage for src/wiktextract/extractor/en/analyze_template.py

1import re

2from collections import defaultdict

4from wikitextprocessor import Page, Wtp

def analyze_template(wtp: Wtp, page: Page) -> tuple[set[str], bool]:
    """Analyze a template body for called templates and pre-expansion need.

    Returns a tuple ``(included_templates, pre_expand)``:

    * ``included_templates`` is the set of template names called from the
      part of the body that lies outside the paired table start/end
      region (the text of the first argument of each ``{{...}}`` call,
      with ``<nowiki/>`` removed).
    * ``pre_expand`` is True if the template should be pre-expanded
      before final parsing, i.e. if the body starts with a list item,
      contains unpaired table start/end tokens, contains table element
      tokens outside paired tables and templates, or contains
      unbalanced paired HTML tags.

    The pre-expand flag is determined based on this body only; the
    caller should propagate it to templates that include the given
    template.  This does not work for template and template function
    calls where the name is generated by other expansions.

    Redirect pages and pages without a body yield ``(set(), False)``.
    """
    if page.redirect_to is not None or page.body is None:
        return set(), False

    body = page.body
    included_templates: set[str] = set()

    # Determine if the template starts with a list item
    # XXX should we expand other templates that produce list items???
    contains_list = body.startswith(("#", "*", ";", ":"))

    # Remove paired tables.
    # What is left is unpaired tables, which is an indication that a
    # template somewhere should be generating those table eventually,
    # and thus needs to be pre-expanded.
    table_start_pos = []
    table_end_pos = []
    # `[[wikt:/|}]]` in Template:Mon standard keyboard
    # and `{{l|mul|} }}` in Template:punctuation are not end of table token
    # but `|}]]` in Template:Lithuania map is a table
    for m in re.finditer(
        r"""
        (?<!{){\|  # `{|` not after `{`, like `{{{|}}}`
        |
        \|}(?!\s*})  # `|}` not before ` }`
        """,
        body,
        re.VERBOSE,
    ):
        if m.group() == "{|":
            table_start_pos.append(m.start())
        else:
            table_end_pos.append(m.end())
    num_table_start = len(table_start_pos)
    num_table_end = len(table_end_pos)
    contains_unpaired_table = num_table_start != num_table_end

    # By default nothing is cut out: both bounds sit at the end of the body.
    table_start = len(body)
    table_end = table_start
    if num_table_start > num_table_end and num_table_end > 0:
        table_start = table_start_pos[num_table_start - num_table_end - 1]
        table_end = table_end_pos[-1]
    elif num_table_start < num_table_end and num_table_start > 0:
        table_start = table_start_pos[0]
        table_end = table_end_pos[num_table_start]
    elif num_table_start > 0 and num_table_end > 0:
        table_start = table_start_pos[0]
        table_end = table_end_pos[-1]
    unpaired_text = body[:table_start] + body[table_end:]

    # Determine if the template contains table element tokens
    # outside paired table start/end.  We only try to look for
    # these outside templates, as it is common to write each
    # template argument on its own line starting with a "|".
    argument_re = r"""(?sx)\{\{\{   # {{{
        (  [^{}]                    # no {} except...
        |  \}[^}]                   # no }} unless...
        |  \}\}[^}]                 # they're definitely not }}}
        )*?
        \}\}\}                      # }}}
        """
    template_re = r"""(?sx)\{\{
        (  [^{}]
        |  \}[^}]
        )*?
        \}\}"""
    outside = unpaired_text
    while True:
        prev = outside
        # handle {{{ }}} parameters without templates inside them;
        # repeat until a pass removes nothing more
        while True:
            newt = re.sub(argument_re, "", prev)
            if newt == prev:
                break
            prev = newt
        # Handle templates (innermost ones; the outer loop repeats so
        # that enclosing calls are removed on later passes)
        newt = re.sub(template_re, "", newt)
        if newt == outside:
            break
        outside = newt

    # Check if the template contains certain table elements:
    # `|+`, `|-` or `!` at the start of a line, or `||` / `!!` at the very
    # start of the text (optionally after whitespace and <includeonly>).
    m = re.search(r"(?s)(^|\n)(\|\+|\|-|\!)", outside)
    m2 = re.match(r"(?si)\s*(<includeonly>|)(\|\||!!)", outside)
    contains_table_element = m is not None or m2 is not None

    # Check for unpaired HTML tags
    tag_cnts: defaultdict[str, int] = defaultdict(int)
    # The attribute part must be matched lazily (`[^>]*?`); a greedy
    # `[^>]*` would swallow the trailing `/` of a self-closing tag so
    # that the final group could never capture it.
    tag_re = r"(?si)<(/)?({})\b\s*[^>]*?(/)?>".format(
        "|".join(wtp.paired_html_tags)
    )
    for m in re.finditer(tag_re, outside):
        start_slash = m.group(1)
        # Matching is case-insensitive, so normalize the name to keep
        # `<DIV>` and `</div>` in the same counter.
        tagname = m.group(2).lower()
        end_slash = m.group(3)
        if start_slash:
            tag_cnts[tagname] -= 1
        elif not end_slash:
            # Self-closing tags (`<tag ... />`) neither open nor close.
            tag_cnts[tagname] += 1
    contains_unbalanced_html = any(v != 0 for v in tag_cnts.values())

    # Determine which other templates are called from unpaired text.
    # None of the flags we currently gather propagate outside a paired
    # table start/end.
    for m in re.finditer(
        # capture the first parameter of a template, ie. the name
        r"""(?sx)(^ | [^{])          # start
        (\{\{)?\{\{([^{]*?)          # ( ({{) {{ (name) )
        (\| | \}\})                  # | or }}""",
        unpaired_text,
    ):
        called_template = re.sub(r"(?si)<nowiki\s*/>", "", m.group(3))
        if called_template:
            included_templates.add(called_template)

    # Determine whether this template should be pre-expanded
    pre_expand = (
        contains_list
        or contains_unpaired_table
        or contains_table_element
        or contains_unbalanced_html
    )

    return included_templates, pre_expand

Coverage for src/wiktextract/extractor/en/analyze_template.py: 89%

61 statements