You are here

taxonomy-slicer.sh in Taxonomy import/export via XML 6.2

#!/bin/bash
###############################################################################
#
# Step by step script to 
# - fetch the remote vocabulary dump file
# - select a subset of the available names - possibly just one item
# - select the related ancestors and descendants to construct a context for the term
# - Output a CSV file that can be imported into Drupal taxonomy.
#
# This is specifically designed to massage and import ONLY the raw dump files provided by NCBI
# ftp://ftp.ncbi.nih.gov/pub/taxonomy/
#
# @author dman dan@coders.co.nz
###############################################################################

###############################################################################
# SETTINGS
# Add to this list for greater depth
ancestors="parent grandparent greatgrandparent 4parent 5parent 6parent 7parent 8parent 9parent 10parent 11parent 12parent 13parent 14parent 15parent 16parent 17parent 18parent  ";
descendants="child grandchild greatgrandchild 4child 5child";

# or keep it in the family
ancestors="parent";
descendants="child";

# Choose an ID or name pattern to focus on
# eg a list of apteryxs
pattern="Apteryx";
# or id 
pattern="^3627\t";
# or ids 8800-8999
pattern="^8[8-9][0-9][0-9]\t";

###############################################################################
# BEGIN
#

# Fetch dump if needed
#
# Both tables are read further down (names.dmp for the selection and labels,
# nodes.dmp for the parent/child links), so only skip the download when both
# are already present.
if [ -f names.dmp ] && [ -f nodes.dmp ] ; then
  echo "The dump file is available already";
else
  # Stop here on failure: every later step would otherwise run against
  # missing or half-extracted dump files.
  if ! wget ftp://ftp.ncbi.nih.gov/pub/taxonomy/taxdump.tar.gz ; then
    echo "Could not download taxdump.tar.gz" >&2
    exit 1
  fi
  if ! tar -xzf taxdump.tar.gz ; then
    echo "Could not unpack taxdump.tar.gz" >&2
    exit 1
  fi
fi

# Select a number range to focus the taxonomy building on
#
# Fetch the selection
# Keep the names.dmp rows that match $pattern, reduce each to its first
# |-delimited column (the ID), and store the sorted, de-duplicated list.
sed -n "/${pattern}/p" names.dmp \
  | awk -F'|' '{ print $1 }' \
  | sort \
  | uniq > subset-ids.txt

# Report the selection. The expansion is left unquoted on purpose so that
# the IDs are printed together on one line.
subsets=$(cat subset-ids.txt)
echo $subsets


###############################################################################
# Loop the parent-finding process several times, collating the resulting IDs
#
# Collect ancestor IDs.
#
# Starting from the selected IDs, one pass over nodes.dmp is made per word in
# $ancestors. Each pass adds the parent (column 2) of every node whose own ID
# (column 1) is already in the set.
#
# IDs are looked up in an awk set instead of one large sed alternation, so
# only an exact match on column 1 counts, and IDs are written without the
# tab padding of the dump so that sort | uniq can really de-duplicate them.
#
# Reads:   subset-ids.txt, nodes.dmp, $ancestors
# Writes:  subset-parent-ids.txt (one bare ID per line, sorted, unique)
# Outputs: one progress line per generation on stdout
collect_ancestor_ids() {
  local lineage
  cp subset-ids.txt subset-parent-ids.txt
  # $ancestors is deliberately unquoted: it is a space-separated word list.
  for lineage in $ancestors; do
    echo "Retrieving $lineage IDs"
    awk -F'|' '
      # Strip the blank/tab padding that surrounds every dump column.
      function bare(field) { gsub(/[ \t]/, "", field); return field }

      # First file: the IDs gathered so far. Remember and re-emit them.
      FILENAME == ARGV[1] {
        count = split($0, word, " ")
        for (i = 1; i <= count; i++) { known[word[i]] = 1; print word[i] }
        next
      }

      # Second file: nodes.dmp. Emit the parent of every known node.
      (bare($1) in known) {
        parent = bare($2)
        if (parent != "") print parent
      }
    ' subset-parent-ids.txt nodes.dmp | sort | uniq > subset-parents.uniq
    mv subset-parents.uniq subset-parent-ids.txt
  done
}

collect_ancestor_ids

# We now have a list of ids that are higher than (and also include) our subsets

echo "IDs for several generations up are :"
cat subset-parent-ids.txt

###############################################################################
# Loop the child-finding process several times, collating the resulting IDs
# Scan to match the second column (parents), retrieving the first col (child)
#
# Collect descendant IDs.
#
# Starting from the selected IDs, one pass over nodes.dmp is made per word in
# $descendants. Each pass adds the ID (column 1) of every node whose parent
# (column 2) is already in the set.
#
# Only column 2 is compared. A plain "<tab>ID<tab>" regex would also hit the
# later numeric columns of a nodes.dmp row and pull in unrelated nodes
# whenever a small ID is in the set.
#
# Reads:   subset-ids.txt, nodes.dmp, $descendants
# Writes:  subset-child-ids.txt (one bare ID per line, sorted, unique)
# Outputs: one progress line per generation on stdout
collect_descendant_ids() {
  local lineage
  cp subset-ids.txt subset-child-ids.txt
  # $descendants is deliberately unquoted: it is a space-separated word list.
  for lineage in $descendants; do
    echo "Retrieving $lineage IDs"
    awk -F'|' '
      # Strip the blank/tab padding that surrounds every dump column.
      function bare(field) { gsub(/[ \t]/, "", field); return field }

      # First file: the IDs gathered so far. Remember and re-emit them.
      FILENAME == ARGV[1] {
        count = split($0, word, " ")
        for (i = 1; i <= count; i++) { known[word[i]] = 1; print word[i] }
        next
      }

      # Second file: nodes.dmp. Emit every node whose parent is known.
      (bare($2) in known) {
        child = bare($1)
        if (child != "") print child
      }
    ' subset-child-ids.txt nodes.dmp | sort | uniq > subset-child.uniq
    mv subset-child.uniq subset-child-ids.txt
  done
}

collect_descendant_ids

echo "IDs for several generations down are :"
cat subset-child-ids.txt
# we now have a list of ids that are lower than (and also include) our subsets


###############################################################################
# Now retrieve names of all the nodes of interest, for review
# Build subset-all-triples.csv: one "subject , predicate , object" line per
# fact about every ID listed in subset-child-ids.txt or subset-parent-ids.txt.
#
# From names.dmp (column 1 = ID, column 2 = name):
#   rows containing "scientific name"   -> ", name ,"
#   rows containing "synonym"           -> ", Used for ,"
#   rows containing "<tab>common name"  -> ", Definition ,Common Name:"
#   every row                           -> ", Definition ,GenBankID:"
# From nodes.dmp (column 1 = ID, column 2 = parent):
#   every row                           -> ", Broader Terms ,"
#
# IDs are matched exactly against column 1 through an awk set rather than a
# sed alternation seeded with a literal word, so a row is never selected just
# because of text that happens to appear elsewhere on the line. Both dumps are
# read once each instead of once per predicate. The printed columns keep the
# tab padding of the dump, exactly as the |-split fields carry it.
#
# Reads:  subset-child-ids.txt, subset-parent-ids.txt, names.dmp, nodes.dmp
# Writes: subset-all-triples.csv (sorted with sort -g, de-duplicated)
build_triples_csv() {
  awk -F'|' '
    # First two files: the IDs of interest, whitespace separated.
    FILENAME == ARGV[1] || FILENAME == ARGV[2] {
      count = split($0, word, " ")
      for (i = 1; i <= count; i++) known[word[i]] = 1
      next
    }

    # Dump rows: skip everything whose ID is not of interest.
    {
      id = $1
      gsub(/[ \t]/, "", id)
      if (!(id in known)) next
    }

    # names.dmp: labels, synonyms, common names and the ID itself.
    FILENAME == ARGV[3] {
      if ($0 ~ /scientific name/) print $1, ", name ,", $2
      if ($0 ~ /synonym/) print $1, ", Used for ,", $2
      if ($0 ~ /\tcommon name/) print $1, ", Definition ,Common Name:", $2
      print $1, ", Definition ,GenBankID:", $1
      next
    }

    # nodes.dmp: parent relationships.
    { print $1, ", Broader Terms ,", $2 }
  ' subset-child-ids.txt subset-parent-ids.txt names.dmp nodes.dmp \
    | sort -g | uniq > subset-all-triples.uniq
  # Sorting keeps individual terms mostly together; uniq drops the repeated
  # GenBankID lines produced by IDs that have several names.
  mv subset-all-triples.uniq subset-all-triples.csv
}

echo "Constructing a triplestore CSV describing each element of interest"

build_triples_csv

File

samples/taxonomy-slicer.sh
View source
  1. #!/bin/bash
  2. ###############################################################################
  3. #
  4. # Step by step script to
  5. # - fetch the remote vocabulary dump file
  6. # - select a subset of the available names - possibly just one item
  7. # - select the related ancestors and descendants to construct a context for the term
  8. # - Output a CSV file that can be imported into Drupal taxonomy.
  9. #
  10. # This is specifically designed to massage and import ONLY the raw dump files provided by NCBI
  11. # ftp://ftp.ncbi.nih.gov/pub/taxonomy/
  12. #
  13. # @author dman dan@coders.co.nz
  14. ###############################################################################
  15. ###############################################################################
  16. # SETTINGS
  17. # Add to this list for greater depth
  18. ancestors="parent grandparent greatgrandparent 4parent 5parent 6parent 7parent 8parent 9parent 10parent 11parent 12parent 13parent 14parent 15parent 16parent 17parent 18parent ";
  19. descendants="child grandchild greatgrandchild 4child 5child";
  20. # or keep it in the family
  21. ancestors="parent";
  22. descendants="child";
  23. # Choose an ID or name pattern to focus on
  24. # eg a list of apteryxs
  25. pattern="Apteryx";
  26. # or id
  27. pattern="^3627\t";
  28. # or ids 8800-8999
  29. pattern="^8[8-9][0-9][0-9]\t";
  30. ###############################################################################
  31. # BEGIN
  32. #
  33. # Fetch dump if needed
  34. #
  35. if [ -f names.dmp ] ; then
  36. echo "The dump file is available already";
  37. else
  38. wget wget ftp://ftp.ncbi.nih.gov/pub/taxonomy/taxdump.tar.gz
  39. tar -xzf taxdump.tar.gz
  40. fi
  41. # Select a number range to focus the taxonomy building on
  42. #
  43. # Fetch the selection
  44. sed -n "/$pattern/p" names.dmp | awk 'BEGIN {FS="|"} {print $1}' | sort| uniq > subset-ids.txt
  45. subsets=`cat subset-ids.txt`
  46. echo $subsets
  47. ###############################################################################
  48. # Loop the parent-finding process several times, collating the resulting IDs
  49. #
  50. cp subset-ids.txt subset-parent-ids.txt
  51. for lineage in $ancestors; do
  52. echo "Retrieving $lineage IDs";
  53. subsetparents=;
  54. # scan for grandparents of the subsets
  55. subsetparents=`cat subset-parent-ids.txt`;
  56. subpattern=dummy;
  57. # subpattern is a big regexp containing all current IDs
  58. for subid in $subsetparents; do subpattern="$subpattern\|^$subid\t" ; done;
  59. #echo $subpattern;
  60. sed -n "/$subpattern/p" nodes.dmp | awk 'BEGIN {FS="|"} {print $2}' >> subset-parent-ids.txt;
  61. sort subset-parent-ids.txt | uniq > subset-parents.uniq; mv subset-parents.uniq subset-parent-ids.txt;
  62. done;
  63. # We now have a list of ids that are higher (and also probably include) our subsets
  64. echo "IDs for several generations up are :"
  65. cat subset-parent-ids.txt
  66. ###############################################################################
  67. # Loop the child-finding process several times, collating the resulting IDs
  68. # Scan to match the second column (parents), retrieving the first col (child)
  69. #
  70. cp subset-ids.txt subset-child-ids.txt
  71. for lineage in $descendants ; do
  72. echo "Retrieving $lineage IDs";
  73. subsetparents=;
  74. # scan for children of the current set
  75. subsetset=`cat subset-child-ids.txt`;
  76. subpattern=dummy;
  77. # subpattern is a big regexp containing all current IDs
  78. for subid in $subsetset; do subpattern="$subpattern\|\t$subid\t" ; done;
  79. #echo $subpattern;
  80. sed -n "/$subpattern/p" nodes.dmp | awk 'BEGIN {FS="|"} {print $1}' >> subset-child-ids.txt;
  81. sort subset-child-ids.txt | uniq > subset-child.uniq; mv subset-child.uniq subset-child-ids.txt;
  82. done;
  83. echo "IDs for several generations down are :"
  84. cat subset-child-ids.txt
  85. # we now have a list of ids that are lower (and also probably include) our subsets
  86. ###############################################################################
  87. # Now retrieve names of all the nodes of interest, for review
  88. idsofinterest=`cat subset-child-ids.txt subset-parent-ids.txt | sort | uniq`
  89. echo "Constructing a triplestore CSV describing each element of interest"
  90. subpattern=dummy;
  91. for subid in $idsofinterest; do subpattern="$subpattern\|^$subid\t" ; done;
  92. sed -n "/$subpattern/p" names.dmp | grep "scientific name" | awk 'BEGIN {FS="|"} {print $1,", name ,", $2}' > subset-all-triples.csv
  93. sed -n "/$subpattern/p" names.dmp | grep "synonym" | awk 'BEGIN {FS="|"} {print $1, ", Used for ,", $2}' >> subset-all-triples.csv
  94. sed -n "/$subpattern/p" names.dmp | sed -n "/\tcommon name/p" | awk 'BEGIN {FS="|"} {print $1,", Definition ,Common Name:", $2}' >> subset-all-triples.csv
  95. sed -n "/$subpattern/p" names.dmp | awk 'BEGIN {FS="|"} {print $1,", Definition ,GenBankID:", $1}' | uniq >> subset-all-triples.csv
  96. # Retrieve all parent relationships
  97. sed -n "/$subpattern/p" nodes.dmp | awk 'BEGIN {FS="|"} {print $1,", Broader Terms ,", $2}' >> subset-all-triples.csv
  98. # Sort it to keep individual terms mostly together
  99. sort -g subset-all-triples.csv | uniq > subset-all-triples.uniq; mv subset-all-triples.uniq subset-all-triples.csv;