Coverage for src/wiktextract/extractor/th/descendant.py: 97%

1from mediawiki_langcodes import code_to_name

2from wikitextprocessor import HTMLNode, NodeKind, TemplateNode, WikiNode

4from ...page import clean_node

5from ...wxr_context import WiktextractContext

6from ..ruby import extract_ruby

7from .models import Descendant, WordEntry

def extract_descendant_section(
    wxr: WiktextractContext, word_entry: WordEntry, level_node: WikiNode
):
    """Fill ``word_entry.descendants`` from one section node.

    Two kinds of children of ``level_node`` are processed:

    * templates named ``CJKV`` or ``Sinoxenic-word`` are passed to
      ``extract_cjkv_template``;
    * every list item of every child list is passed to
      ``extract_desc_list_item`` with an empty parent list, i.e. as a
      top-level descendant line.
    """
    cjkv_template_names = ("CJKV", "Sinoxenic-word")
    for template in level_node.find_child(NodeKind.TEMPLATE):
        if template.template_name not in cjkv_template_names:
            continue
        extract_cjkv_template(wxr, word_entry, template)

    for desc_list in level_node.find_child(NodeKind.LIST):
        for item in desc_list.find_child(NodeKind.LIST_ITEM):
            # Top-level items have no parent descendants.
            extract_desc_list_item(wxr, word_entry, [], item)

def extract_desc_list_item(
    wxr: WiktextractContext,
    word_entry: WordEntry,
    parent_data: list[Descendant],
    list_item: WikiNode,
):
    """Extract the descendants found in one list item, recursing into sublists.

    ``parent_data`` holds the descendants of the enclosing list item (empty
    for a top-level item) and is forwarded to ``extract_desc_template``.
    Descendants produced by this item's own templates are accumulated and,
    in turn, handed down as the parent list for any nested list items.
    """
    desc_template_names = (
        "desc",
        "descendant",
        "desctree",
        "descendants tree",
    )
    item_descendants = []
    for child in list_item.children:
        is_desc_template = (
            isinstance(child, TemplateNode)
            and child.template_name in desc_template_names
        )
        if is_desc_template:
            item_descendants.extend(
                extract_desc_template(wxr, word_entry, parent_data, child)
            )
            continue

        # Anything that is not a nested list is ignored.
        if not isinstance(child, WikiNode) or child.kind != NodeKind.LIST:
            continue
        for nested_item in child.find_child(NodeKind.LIST_ITEM):
            # Nested items attach to what this item has collected so far.
            extract_desc_list_item(
                wxr, word_entry, item_descendants, nested_item
            )

def extract_desc_template(
    wxr: WiktextractContext,
    word_entry: WordEntry,
    parent_data: list[Descendant],
    t_node: TemplateNode,
) -> list[Descendant]:
    """Expand a descendant template and turn its ``<span>`` tags into data.

    The language code is read from the template's first positional
    parameter; its name is looked up with ``code_to_name(..., "th")`` and
    falls back to ``"unknown"``.

    Each ``<span>`` of the expansion is classified as follows:

    * ``lang`` ending in ``-Latn`` or class ``tr``: romanization of the most
      recently created descendant;
    * class ``mention-gloss``: sense of the most recently created descendant;
    * ``lang`` equal to the template's language code: a new descendant word.

    The first two cases only apply once at least one descendant exists.

    The new descendants are appended to every entry of ``parent_data`` when
    it is non-empty, otherwise to ``word_entry.descendants``.

    Returns the list of descendants created from this template.
    """
    expanded = wxr.wtp.parse(wxr.wtp.node_to_wikitext(t_node), expand_all=True)
    lang_code = clean_node(wxr, None, t_node.template_parameters.get(1, ""))
    lang_name = code_to_name(lang_code, "th") or "unknown"

    found: list[Descendant] = []
    for span in expanded.find_html("span"):
        lang_attr = span.attrs.get("lang", "")
        css_classes = span.attrs.get("class", "").split()
        latest = found[-1] if found else None
        is_romanization = lang_attr.endswith("-Latn") or "tr" in css_classes

        if is_romanization and latest is not None:
            latest.roman = clean_node(wxr, None, span)
        elif "mention-gloss" in css_classes and latest is not None:
            latest.sense = clean_node(wxr, None, span)
        elif lang_attr == lang_code:
            word = clean_node(wxr, None, span)
            found.append(
                Descendant(lang_code=lang_code, lang=lang_name, word=word)
            )

    if not parent_data:
        word_entry.descendants.extend(found)
    else:
        for parent in parent_data:
            parent.descendants.extend(found)

    # Run clean_node over the whole expansion with word_entry as its target.
    # NOTE(review): presumably this records category links on the entry;
    # confirm against clean_node.
    clean_node(wxr, word_entry, expanded)
    return found

def extract_cjkv_template(
    wxr: WiktextractContext, word_entry: WordEntry, t_node: TemplateNode
):
    """Expand a CJKV-style template and record one descendant per list item.

    Every list item found recursively in the expansion starts a descendant
    with placeholder values, which is then filled from the item's children:

    * the first string child ending in ``:`` gives the language name;
    * a ``<span class="desc-arr">`` contributes its non-empty ``title`` as a
      raw tag;
    * a ``<span class="tr">`` gives the romanization;
    * any other ``<span>`` with a ``lang`` attribute gives the language
      code, the ruby data and the word (text with ruby removed).

    Descendants that end up with an empty word are discarded.
    """
    expanded = wxr.wtp.parse(wxr.wtp.node_to_wikitext(t_node), expand_all=True)
    for item in expanded.find_child_recursively(NodeKind.LIST_ITEM):
        entry = Descendant(word="", lang="unknown", lang_code="unknown")
        for child in item.children:
            if isinstance(child, str):
                # Only the first "name:" label sets the language name.
                if child.strip().endswith(":") and entry.lang == "unknown":
                    entry.lang = child.strip(": ")
                continue
            if not (isinstance(child, HTMLNode) and child.tag == "span"):
                continue

            css_class = child.attrs.get("class", "")
            if css_class == "desc-arr":
                title = child.attrs.get("title", "")
                if title != "":
                    entry.raw_tags.append(title)
            elif css_class == "tr":
                entry.roman = clean_node(wxr, None, child)
            elif "lang" in child.attrs:
                entry.lang_code = child.attrs["lang"]
                ruby, remainder = extract_ruby(wxr, child)
                entry.ruby = ruby
                entry.word = clean_node(wxr, None, remainder)

        if entry.word == "":
            continue
        word_entry.descendants.append(entry)

Coverage for src/wiktextract/extractor/th/descendant.py: 97%

63 statements