Coverage for src / wiktextract / extractor / th / etymology.py: 37%

49 statements  

« prev     ^ index     » next       coverage.py v7.13.0, created at 2025-12-12 08:09 +0000

1from wikitextprocessor import ( 

2 HTMLNode, 

3 LevelNode, 

4 NodeKind, 

5 TemplateNode, 

6 WikiNode, 

7) 

8 

9from ...page import clean_node 

10from ...wxr_context import WiktextractContext 

11from .models import Form, WordEntry 

12from .tags import translate_raw_tags 

13 

14 

def extract_etymology_section(
    wxr: WiktextractContext, base_data: WordEntry, level_node: LevelNode
):
    """Fill ``base_data.etymology_texts`` from an etymology section.

    The direct children of ``level_node`` are handled as follows:

    * templates named ``ja-kt`` or ending in ``-kanjitab`` are passed to
      ``extract_ja_kanjitab_template``;
    * for list nodes, each list item is cleaned on its own and every
      non-empty result becomes a separate etymology text;
    * nested ``LevelNode`` children and the ``ja-see`` / ``ja-see-kango``
      templates are skipped;
    * everything else is gathered and cleaned together after the loop,
      producing at most one more etymology text.
    """
    skipped_templates = ("ja-see", "ja-see-kango")
    loose_nodes = []

    for child in level_node.children:
        is_template = isinstance(child, TemplateNode)

        if is_template and (
            child.template_name == "ja-kt"
            or child.template_name.endswith("-kanjitab")
        ):
            extract_ja_kanjitab_template(wxr, child, base_data)
            continue

        if isinstance(child, WikiNode) and child.kind == NodeKind.LIST:
            for item in child.find_child(NodeKind.LIST_ITEM):
                item_text = clean_node(wxr, base_data, item.children)
                if item_text != "":
                    base_data.etymology_texts.append(item_text)
            continue

        # Sub-sections and the "see other entry" templates contribute no
        # etymology text of their own.
        if isinstance(child, LevelNode):
            continue
        if is_template and child.template_name in skipped_templates:
            continue

        loose_nodes.append(child)

    if loose_nodes:
        loose_text = clean_node(wxr, base_data, loose_nodes)
        if loose_text != "":
            base_data.etymology_texts.append(loose_text)

43 

44 

def _collect_alt_forms(wxr: WiktextractContext, table: WikiNode) -> list:
    """Return the ``Form`` objects found in one expanded kanjitab table.

    An empty list is returned unless at least one header cell of the table
    starts with "การสะกดแบบอื่น".
    """
    has_alt_header = False
    # Every header cell is visited (no early exit) so the sequence of
    # clean_node calls does not depend on where the header is found.
    for row in table.find_child(NodeKind.TABLE_ROW):
        for header in row.find_child(NodeKind.TABLE_HEADER_CELL):
            if clean_node(wxr, None, header).startswith("การสะกดแบบอื่น"):
                has_alt_header = True
    if not has_alt_header:
        return []

    alt_forms = []
    for row in table.find_child(NodeKind.TABLE_ROW):
        for cell in row.find_child(NodeKind.TABLE_CELL):
            for element in cell.children:
                if not isinstance(element, HTMLNode):
                    continue
                if element.tag == "span":
                    # Each non-empty <span> yields one alternative form.
                    word = clean_node(wxr, None, element)
                    if word != "":
                        alt_forms.append(
                            Form(form=word, tags=["alternative", "kanji"])
                        )
                elif element.tag == "small":
                    # A <small> note is attached to the most recently
                    # collected form, with surrounding parentheses removed.
                    note = clean_node(wxr, None, element).strip("()")
                    if note != "" and len(alt_forms) > 0:
                        latest = alt_forms[-1]
                        latest.raw_tags.append(note)
                        translate_raw_tags(latest)
    return alt_forms


def extract_ja_kanjitab_template(
    wxr: WiktextractContext, t_node: TemplateNode, base_data: WordEntry
):
    """Add alternative kanji spellings from a kanjitab template.

    The template is converted back to wikitext and re-parsed with
    ``expand_all=True``. Forms collected from each top-level table of the
    expansion are appended to ``base_data.forms``. Top-level link nodes of
    the expansion are then passed through ``clean_node`` with ``base_data``;
    the returned text is discarded (presumably the call is made for its
    effect on ``base_data``, such as recording categories -- confirm against
    ``clean_node``).
    """
    # https://th.wiktionary.org/wiki/Template:ja-kanjitab
    template_wikitext = wxr.wtp.node_to_wikitext(t_node)
    expanded_root = wxr.wtp.parse(template_wikitext, expand_all=True)

    for table in expanded_root.find_child(NodeKind.TABLE):
        table_forms = _collect_alt_forms(wxr, table)
        if table_forms:
            base_data.forms.extend(table_forms)

    for link in expanded_root.find_child(NodeKind.LINK):
        clean_node(wxr, base_data, link)