Coverage for src / wiktextract / extractor / ko / etymology.py: 43%

55 statements  

« prev     ^ index     » next       coverage.py v7.13.0, created at 2025-12-12 08:09 +0000

1from wikitextprocessor import HTMLNode, LevelNode, NodeKind, TemplateNode 

2 

3from ...page import clean_node 

4from ...wxr_context import WiktextractContext 

5from .models import Form, WordEntry 

6from .tags import translate_raw_tags 

7 

8 

def extract_etymology_section(
    wxr: WiktextractContext, word_entry: WordEntry, level_node: LevelNode
) -> None:
    """Fill ``word_entry.etymology_texts`` from an etymology section.

    If ``word_entry`` already holds etymology texts, both those texts and
    ``word_entry.categories`` are cleared before extraction starts.

    When the section has at least one LIST child, every LIST_ITEM whose
    cleaned text is non-empty contributes one entry. Otherwise the
    section's direct children are walked up to the first nested
    ``LevelNode``: templates named ``ja-kt`` or ending in ``-kanjitab``
    are passed to ``extract_ja_kanjitab_template`` and every other node
    is cleaned together into a single text.
    """
    if word_entry.etymology_texts:
        word_entry.etymology_texts.clear()
        word_entry.categories.clear()

    found_list = False
    for list_node in level_node.find_child(NodeKind.LIST):
        found_list = True
        for item in list_node.find_child(NodeKind.LIST_ITEM):
            item_text = clean_node(wxr, word_entry, item.children)
            if item_text:
                word_entry.etymology_texts.append(item_text)
    if found_list:
        return

    # No list in the section: gather the loose nodes that precede the
    # first subsection and clean them as a single piece of text.
    plain_nodes = []
    for child in level_node.children:
        if isinstance(child, TemplateNode):
            name = child.template_name
            if name == "ja-kt" or name.endswith("-kanjitab"):
                extract_ja_kanjitab_template(wxr, child, word_entry)
                continue
        if isinstance(child, LevelNode):
            break
        plain_nodes.append(child)

    section_text = clean_node(wxr, word_entry, plain_nodes)
    if section_text:
        word_entry.etymology_texts.append(section_text)

40 

41 

def extract_ja_kanjitab_template(
    wxr: WiktextractContext, t_node: TemplateNode, base_data: WordEntry
):
    """Add alternative forms from an expanded kanjitab template.

    The template is converted back to wikitext and re-parsed with
    ``expand_all=True``. Only tables containing a header cell whose
    cleaned text is exactly "다른 표기" (Korean; roughly "other
    spelling") are used. In such a table each non-empty ``<span>``
    directly inside a cell becomes a ``Form`` tagged
    ``["alternative", "kanji"]``. Each non-empty ``<small>`` (with
    surrounding parentheses stripped) is appended as a raw tag to the
    most recently collected form, which is then passed to
    ``translate_raw_tags``.

    Finally, the top-level LINK nodes of the expansion are cleaned
    against ``base_data`` (presumably so that category links are
    recorded on it -- confirm against ``clean_node``).
    """
    wikitext = wxr.wtp.node_to_wikitext(t_node)
    root = wxr.wtp.parse(wikitext, expand_all=True)

    for table in root.find_child(NodeKind.TABLE):
        # Every header cell is cleaned (no short-circuit) before the
        # membership test decides whether this table is relevant.
        header_texts = [
            clean_node(wxr, None, header_cell)
            for header_row in table.find_child(NodeKind.TABLE_ROW)
            for header_cell in header_row.find_child(
                NodeKind.TABLE_HEADER_CELL
            )
        ]
        if "다른 표기" not in header_texts:
            continue

        alt_forms = []
        for row in table.find_child(NodeKind.TABLE_ROW):
            for cell in row.find_child(NodeKind.TABLE_CELL):
                for child in cell.children:
                    if not isinstance(child, HTMLNode):
                        continue
                    if child.tag == "span":
                        form_text = clean_node(wxr, None, child)
                        if form_text != "":
                            new_form = Form(
                                form=form_text, tags=["alternative", "kanji"]
                            )
                            alt_forms.append(new_form)
                    elif child.tag == "small":
                        raw_tag = clean_node(wxr, None, child).strip("()")
                        # A qualifier with no preceding form is dropped.
                        if raw_tag != "" and len(alt_forms) > 0:
                            latest = alt_forms[-1]
                            latest.raw_tags.append(raw_tag)
                            translate_raw_tags(latest)
        base_data.forms.extend(alt_forms)

    for link_node in root.find_child(NodeKind.LINK):
        clean_node(wxr, base_data, link_node)