Coverage for src/wiktextract/extractor/ko/etymology.py: 43%

1from wikitextprocessor import HTMLNode, LevelNode, NodeKind, TemplateNode

3from ...page import clean_node

4from ...wxr_context import WiktextractContext

5from .models import Form, WordEntry

6from .tags import translate_raw_tags

def extract_etymology_section(
    wxr: WiktextractContext, word_entry: WordEntry, level_node: LevelNode
) -> None:
    """Fill ``word_entry.etymology_texts`` from an etymology section.

    If ``word_entry`` already has etymology texts, they are discarded
    together with its categories before extraction starts.

    When the section contains list nodes, every non-empty list item
    becomes one etymology text.  Otherwise the section's children, up to
    the first nested ``LevelNode``, are cleaned together into a single
    text; kanjitab templates found along the way are passed to
    ``extract_ja_kanjitab_template`` instead of being added to that text.
    """
    if word_entry.etymology_texts:
        word_entry.etymology_texts.clear()
        word_entry.categories.clear()

    found_list = False
    for etymology_list in level_node.find_child(NodeKind.LIST):
        found_list = True
        for item in etymology_list.find_child(NodeKind.LIST_ITEM):
            item_text = clean_node(wxr, word_entry, item.children)
            if item_text:
                word_entry.etymology_texts.append(item_text)
    if found_list:
        return

    # No list in the section: gather the free-standing nodes instead.
    plain_nodes = []
    for child in level_node.children:
        is_kanjitab = isinstance(child, TemplateNode) and (
            child.template_name.endswith("-kanjitab")
            or child.template_name == "ja-kt"
        )
        if is_kanjitab:
            extract_ja_kanjitab_template(wxr, child, word_entry)
        elif isinstance(child, LevelNode):
            # A nested heading ends the etymology text.
            break
        else:
            plain_nodes.append(child)

    section_text = clean_node(wxr, word_entry, plain_nodes)
    if section_text:
        word_entry.etymology_texts.append(section_text)

def extract_ja_kanjitab_template(
    wxr: WiktextractContext, t_node: TemplateNode, base_data: WordEntry
):
    """Collect alternative kanji forms from an expanded kanjitab template.

    The template is converted back to wikitext, fully expanded and parsed.
    Only tables having a header cell whose cleaned text is exactly
    "다른 표기" are used.  In those tables each non-empty ``<span>``
    found directly in a table cell becomes a ``Form`` tagged
    ``alternative`` and ``kanji``; a following ``<small>`` element, with
    surrounding parentheses stripped, is attached as a raw tag to the
    most recently collected form.  The forms are appended to
    ``base_data.forms``.

    Finally, each top-level link node of the expansion is run through
    ``clean_node`` with ``base_data`` (presumably to record category
    links on the entry -- confirm against ``clean_node``).
    """
    wikitext = wxr.wtp.node_to_wikitext(t_node)
    root = wxr.wtp.parse(wikitext, expand_all=True)

    for table_node in root.find_child(NodeKind.TABLE):
        # Clean every header cell (no short-circuit) and then look for the
        # alternative-forms heading among the results.
        header_texts = [
            clean_node(wxr, None, hdr_cell)
            for hdr_row in table_node.find_child(NodeKind.TABLE_ROW)
            for hdr_cell in hdr_row.find_child(NodeKind.TABLE_HEADER_CELL)
        ]
        if "다른 표기" not in header_texts:
            continue

        alt_forms = []
        for table_row in table_node.find_child(NodeKind.TABLE_ROW):
            for cell in table_row.find_child(NodeKind.TABLE_CELL):
                for element in cell.children:
                    if not isinstance(element, HTMLNode):
                        continue
                    if element.tag == "span":
                        form_text = clean_node(wxr, None, element)
                        if form_text:
                            alt_forms.append(
                                Form(
                                    form=form_text,
                                    tags=["alternative", "kanji"],
                                )
                            )
                    elif element.tag == "small":
                        label = clean_node(wxr, None, element).strip("()")
                        # A label only makes sense if a form precedes it.
                        if label and alt_forms:
                            latest_form = alt_forms[-1]
                            latest_form.raw_tags.append(label)
                            translate_raw_tags(latest_form)
        base_data.forms.extend(alt_forms)

    for link in root.find_child(NodeKind.LINK):
        clean_node(wxr, base_data, link)

Coverage for src/wiktextract/extractor/ko/etymology.py: 43%

55 statements