Coverage for src/wiktextract/extractor/en/analyze_template.py: 89%
61 statements
« prev ^ index » next coverage.py v7.6.10, created at 2024-12-27 08:07 +0000
« prev ^ index » next coverage.py v7.6.10, created at 2024-12-27 08:07 +0000
1import re
2from collections import defaultdict
4from wikitextprocessor import Page, Wtp
def analyze_template(wtp: Wtp, page: Page) -> tuple[set[str], bool]:
    """Analyze a template body for called templates and pre-expansion need.

    The pre-expand flag is determined from this body only; the caller
    should propagate it to templates that include the given template.
    Template and template-function calls whose name is generated by other
    expansions are not detected.

    Args:
        wtp: Processing context; only ``wtp.paired_html_tags`` (an iterable
            of HTML tag names that require a closing tag) is read.
        page: The template page; only ``page.redirect_to`` and
            ``page.body`` are read.

    Returns:
        A tuple ``(included_templates, pre_expand)``.  ``included_templates``
        is the set of template names called from the text outside the
        paired table region, exactly as written in the body (only
        ``<nowiki/>`` is stripped).  ``pre_expand`` is True if the body
        starts with a list item, has unpaired table start/end tokens,
        has table row/caption/header tokens outside templates, or has
        unbalanced paired HTML tags.  Redirect pages yield
        ``(set(), False)``.
    """
    if page.redirect_to is not None:
        return set(), False

    body = page.body
    included_templates: set[str] = set()

    # Determine if the template starts with a list item
    # XXX should we expand other templates that produce list items???
    contains_list = body.startswith(("#", "*", ";", ":"))

    # Remove paired tables.
    # What is left is unpaired tables, which is an indication that a
    # template somewhere should be generating those table eventually,
    # and thus needs to be pre-expanded.
    table_start_pos: list[int] = []
    table_end_pos: list[int] = []
    # `[[wikt:/|}]]` in Template:Mon standard keyboard
    # and `{{l|mul|} }}` in Template:punctuation are not end of table token
    # but `|}]]` in Template:Lithuania map is a table
    for m in re.finditer(
        r"""
        (?<!{){\|  # `{|` not after `{`, like `{{{|}}}`
        |
        \|}(?!\s*})  # `|}` not before ` }`
        """,
        body,
        re.VERBOSE,
    ):
        if m.group() == "{|":
            table_start_pos.append(m.start())
        else:
            table_end_pos.append(m.end())
    num_table_start = len(table_start_pos)
    num_table_end = len(table_end_pos)
    contains_unpaired_table = num_table_start != num_table_end

    # Default: an empty region at the end of the body, i.e. nothing is cut.
    table_start = len(body)
    table_end = table_start
    if num_table_start > num_table_end and num_table_end > 0:
        table_start = table_start_pos[num_table_start - num_table_end - 1]
        table_end = table_end_pos[-1]
    elif num_table_start < num_table_end and num_table_start > 0:
        table_start = table_start_pos[0]
        table_end = table_end_pos[num_table_start]
    elif num_table_start > 0 and num_table_end > 0:
        table_start = table_start_pos[0]
        table_end = table_end_pos[-1]
    unpaired_text = body[:table_start] + body[table_end:]

    # Determine if the template contains table element tokens
    # outside paired table start/end.  We only try to look for
    # these outside templates, as it is common to write each
    # template argument on its own line starting with a "|".
    # Innermost arguments and template calls are stripped repeatedly
    # until the text stops changing.
    outside = unpaired_text
    while True:
        prev = outside

        # handle {{{ }}} parameters without templates inside them
        while True:
            newt = re.sub(
                # re.X, ignore white space and comments
                r"""(?sx)\{\{\{  # {{{
                    ( [^{}]      # no {} except...
                    | \}[^}]     # no }} unless...
                    | \}\}[^}]   # they're definitely not }}}
                    )*?
                    \}\}\}       # }}}
                """,
                "",
                prev,
            )
            if newt == prev:
                break
            prev = newt

        # Handle templates
        newt = re.sub(
            r"""(?sx)\{\{
                ( [^{}]
                | \}[^}]
                )*?
                \}\}""",
            "",
            newt,
        )
        if newt == outside:
            break
        outside = newt

    # Check if the template contains certain table elements:
    # start of line plus |+ (caption), |- (row) or ! (header cell)
    m = re.search(r"(?s)(^|\n)(\|\+|\|-|\!)", outside)
    # ... or a cell separator right after a leading <includeonly>/comment
    m2 = re.match(r"(?si)\s*(<includeonly>|<!--.*?-->)(\|\||!!)", outside)
    contains_table_element = m is not None or m2 is not None

    # Check for unpaired HTML tags
    tag_cnts: defaultdict[str, int] = defaultdict(int)
    for m in re.finditer(
        # The attribute part is matched lazily so that a trailing "/" is
        # left for group 3; a greedy match would swallow it and
        # self-closing tags would be counted as opening tags.
        r"(?si)<(/)?({})\b\s*[^>]*?(/)?>".format(
            "|".join(wtp.paired_html_tags)
        ),
        outside,
    ):
        start_slash = m.group(1)
        # Matching is case-insensitive, so count under one canonical key;
        # otherwise <DIV>...</div> would look unbalanced.
        tagname = m.group(2).lower()
        end_slash = m.group(3)
        if start_slash:
            tag_cnts[tagname] -= 1
        elif not end_slash:
            tag_cnts[tagname] += 1
    contains_unbalanced_html = any(tag_cnts.values())

    # Determine which other templates are called from unpaired text.
    # None of the flags we currently gather propagate outside a paired
    # table start/end.
    for m in re.finditer(
        # capture the first parameter of a template, ie. the name.
        # A zero-width lookbehind is used instead of consuming the
        # preceding character, so that a call that directly follows the
        # previous match (`{{a}}{{b}}`, `{{a|{{b}}}}`) is still found.
        r"""(?sx)(?<!\{)           # not preceded by another {
            (?:\{\{)?\{\{([^{]*?)  # ({{) {{ (name)
            (?:\| | \}\})          # | or }}""",
        unpaired_text,
    ):
        called_template = re.sub(r"(?si)<nowiki\s*/>", "", m.group(1))
        if called_template:
            included_templates.add(called_template)

    # Determine whether this template should be pre-expanded
    pre_expand = (
        contains_list
        or contains_unpaired_table
        or contains_table_element
        or contains_unbalanced_html
    )

    return included_templates, pre_expand