Coverage for src/wiktextract/extractor/th/etymology.py: 37%

1from wikitextprocessor import (

2 HTMLNode,

3 LevelNode,

4 NodeKind,

5 TemplateNode,

6 WikiNode,

9from ...page import clean_node

10from ...wxr_context import WiktextractContext

11from .models import Form, WordEntry

12from .tags import translate_raw_tags

def extract_etymology_section(
    wxr: WiktextractContext, base_data: WordEntry, level_node: LevelNode
):
    """Fill ``base_data.etymology_texts`` from the children of ``level_node``.

    Children are handled in document order:

    * templates named ``ja-kt`` or ending in ``-kanjitab`` are passed to
      ``extract_ja_kanjitab_template``;
    * every item of a list node is cleaned on its own and, when the result
      is non-empty, appended as a separate etymology text;
    * nested ``LevelNode`` children and the ``ja-see`` / ``ja-see-kango``
      templates are skipped;
    * everything else is collected and cleaned together after the loop,
      producing at most one further etymology text.
    """
    pending_nodes = []
    for child in level_node.children:
        is_template = isinstance(child, TemplateNode)

        if is_template and (
            child.template_name == "ja-kt"
            or child.template_name.endswith("-kanjitab")
        ):
            extract_ja_kanjitab_template(wxr, child, base_data)
            continue

        if isinstance(child, WikiNode) and child.kind == NodeKind.LIST:
            for item in child.find_child(NodeKind.LIST_ITEM):
                item_text = clean_node(wxr, base_data, item.children)
                if item_text != "":
                    base_data.etymology_texts.append(item_text)
            continue

        # Sub-sections and the "see other entry" templates contribute no
        # etymology text of their own.
        if isinstance(child, LevelNode):
            continue
        if is_template and child.template_name in ["ja-see", "ja-see-kango"]:
            continue

        pending_nodes.append(child)

    if not pending_nodes:
        return
    # The loose nodes are cleaned as a single run, so they yield one text
    # that lands after any list-item texts appended above.
    combined_text = clean_node(wxr, base_data, pending_nodes)
    if combined_text != "":
        base_data.etymology_texts.append(combined_text)

def extract_ja_kanjitab_template(
    wxr: WiktextractContext, t_node: TemplateNode, base_data: WordEntry
):
    """Extract alternative spellings from an expanded kanjitab template.

    The template is converted back to wikitext and re-parsed with full
    expansion. Only tables that have a header cell whose text starts with
    "การสะกดแบบอื่น" are read. In those tables each ``<span>`` found
    directly inside a table cell becomes a ``Form`` tagged ``alternative``
    and ``kanji``; a following ``<small>`` element (with surrounding
    parentheses stripped) is attached to the most recent form as a raw tag.
    The collected forms are appended to ``base_data.forms``.
    """
    # https://th.wiktionary.org/wiki/Template:ja-kanjitab
    template_wikitext = wxr.wtp.node_to_wikitext(t_node)
    root = wxr.wtp.parse(template_wikitext, expand_all=True)

    for table in root.find_child(NodeKind.TABLE):
        # Every header cell is cleaned (no short-circuit) before deciding
        # whether this table lists alternative spellings.
        header_texts = [
            clean_node(wxr, None, header_cell)
            for table_row in table.find_child(NodeKind.TABLE_ROW)
            for header_cell in table_row.find_child(
                NodeKind.TABLE_HEADER_CELL
            )
        ]
        if not any(
            text.startswith("การสะกดแบบอื่น") for text in header_texts
        ):
            continue

        alt_forms = []
        for table_row in table.find_child(NodeKind.TABLE_ROW):
            for cell in table_row.find_child(NodeKind.TABLE_CELL):
                for element in cell.children:
                    if not isinstance(element, HTMLNode):
                        continue
                    if element.tag == "span":
                        spelling = clean_node(wxr, None, element)
                        if spelling != "":
                            alt_forms.append(
                                Form(
                                    form=spelling,
                                    tags=["alternative", "kanji"],
                                )
                            )
                    elif element.tag == "small":
                        label = clean_node(wxr, None, element).strip("()")
                        # A label with no preceding form has nothing to
                        # attach to and is dropped.
                        if label != "" and len(alt_forms) > 0:
                            latest_form = alt_forms[-1]
                            latest_form.raw_tags.append(label)
                            translate_raw_tags(latest_form)
        base_data.forms.extend(alt_forms)

    # Top-level links of the expanded template are cleaned against
    # base_data and the returned text is discarded.
    # NOTE(review): presumably this records category links on base_data;
    # confirm against clean_node.
    for link in root.find_child(NodeKind.LINK):
        clean_node(wxr, base_data, link)

Coverage for src/wiktextract/extractor/th/etymology.py: 37%

49 statements