#!/usr/bin/env bash
# define colors for error messages
# Each value is a backslash-escaped ANSI sequence; it only takes effect when
# printed through something that interprets escapes (echo -e / printf %b).
# Lower-case names use attribute 0 (normal), upper-case names attribute 1 (bold).
red='\033[0;31m'
RED='\033[1;31m'
green='\033[0;32m'
GREEN='\033[1;32m'
yellow='\033[0;33m'
YELLOW='\033[1;33m'
blue='\033[0;34m'
BLUE='\033[1;34m'
purple='\033[0;35m'
PURPLE='\033[1;35m'
cyan='\033[0;36m'
CYAN='\033[1;36m'
# NC resets all attributes ("no colour").
NC='\033[0m'

# usage function
# Prints the command-line help for this script.
# Globals:   YELLOW, NC (read) - colour escape sequences, expanded with %b
# Arguments: none
# Outputs:   help text on stdout
usage () {
    # $0 is printed with %s so a path containing backslashes is shown verbatim;
    # only the colour variables go through escape interpretation.
    printf 'usage: %b%s%b [options]\n' "${YELLOW}" "$0" "${NC}"
    printf '%s\n' \
        'Creating a taxonomy.tab for a buggy krona script, primarily for the custom taxids from this repo' \
        'OPTIONS:' \
        '    -h    show this message' \
        '    -i    names.dmp and nodes.dmp containing directory' \
        '    -o    output taxonomy.tab file. Default is the same location as the -i parameter' \
        ''
}

# Prints a reminder that gawk must be installed.
# Globals:   RED, NC (read) - colour escape sequences, expanded with %b
# Arguments: none
# Outputs:   one blank line, the highlighted message, two blank lines - all on
#            stderr; nothing is written to stdout
gawk_install () {
    printf '\n%b\n\n\n' "${RED}Please make sure gawk is installed.${NC}" >&2
}
# Abort early when gawk is unavailable: the conversion at the bottom of this
# script uses gawk arrays of arrays, so there is nothing useful to do without it.
if ! command -v gawk >/dev/null 2>&1; then
    gawk_install
    exit 1
fi
# First line of the gawk version banner; not used further in this script.
awk_version=$(gawk --version | head -n1)


# ---------------------------------------------------------------------------------------------------
# set default values here
# Start from empty values so variables of the same name inherited from the
# environment cannot be mistaken for command-line arguments.
input=""
output=""

# ---------------------------------------------------------------------------------------------------
# parse input arguments
while getopts "hi:o:" OPTION
do
    case "$OPTION" in
        # Asking for help is not an error.
        h) usage; exit 0 ;;
        i) input=$OPTARG ;;
        o) output=$OPTARG ;;
        # Unknown option or missing option argument: getopts has already
        # printed its own diagnostic, so show the help and fail.
        \?) usage >&2; exit 2 ;;
    esac
done

# check input arguments
# The input directory is validated first because the default output path is
# derived from it.
if [[ -z "$input" ]]; then
    printf '%b%s%b\n' "${RED}" \
        "ERROR: no input path that contains a names and nodes dmp file -i" \
        "${NC}" >&2
    usage >&2
    exit 2
fi

# -s: both dump files must exist and be non-empty.
if [[ ! -s "$input/names.dmp" ]] || [[ ! -s "$input/nodes.dmp" ]]; then
    printf '%b%s%b\n' "${RED}" \
        "ERROR: names or nodes.dmp file dont exist in $input -i ${input}, exiting...." \
        "${NC}" >&2
    usage >&2
    exit 2
fi

if [[ -z "$output" ]]; then
    printf '%b%s%b\n' "${CYAN}" \
        "Warning: no output path for tab file specified, putting in -i ${input}" \
        "${NC}" >&2
    # Strip one trailing slash so the default path has no doubled separator.
    output="${input%/}/taxonomy.tab"
fi

# Report the resolved paths on stdout.
printf '%s\n' "$input"
printf '%s\n' "$output"


# Column notes (logical, "|"-separated columns as named by the original author).
# NOTE(review): assumes the NCBI taxdump layout, where columns are separated by
# <tab>|<tab>; with a bare tab as awk field separator, logical column k is then
# awk field 2k-1 - confirm against the custom dump files of this repo.

# names.dmp file
# 1 is taxid (current)                                                   -> $1
# 2 is the text label of taxid                                           -> $3
# 3 is category of (current) e.g. synonym, common name, scientific name, etc
#   (the program tests $7, i.e. the 4th logical column - TODO confirm which
#   column holds the category in these files)

# nodes.dmp file
# 1 is child taxid (current)                                             -> $1
# 2 is parent taxid                                                      -> $3
# 3 is tax rank of child (current)                                       -> $5

# nodes.dmp is read first (NR==FNR) to build taxid -> rank; names.dmp is read
# second and every name of a taxid is collected. The END block prints one
# "name<TAB>taxid<TAB>rank" row per collected name whose taxid has a rank entry,
# below a header line. Row order follows gawk array traversal order.
gawk -F '\t' '
{
    if (NR == FNR) {
        # first file: remember field 5 for each taxid
        mapping[$1] = $5
    } else if ($7 != "") {
        # second file: append field 3 to the list kept for this taxid
        names[$1][length(names[$1]) + 1] = $3
    }
}
END {
    print "name\ttaxid\trank"
    for (name in names) {
        if (name in mapping) {
            for (i = 1; i <= length(names[name]); i++) {
                print names[name][i] "\t" name "\t" mapping[name]
            }
        }
    }
}
' "$input/nodes.dmp" "$input/names.dmp" > "$output" || {
    printf '%b%s%b\n' "${RED}" \
        "ERROR: gawk failed while writing ${output}" \
        "${NC}" >&2
    exit 1
}