#!/bin/bash

# Public domain notice for all NCBI EDirect scripts is located at:
# https://www.ncbi.nlm.nih.gov/books/NBK179288/#chapter6.Public_Domain_Notice

# archive-taxonomy

# remember when the script started, for the total elapsed time report
total_start=$(date "+%s")

# directory containing this script
pth=$( dirname "$0" )

# turn a relative script directory into an absolute one
if [[ "$pth" != /* ]]
then
  pth=$(cd "$pth" && pwd)
fi

# append the script directory to PATH unless it is already listed there
if [[ ":$PATH:" != *":$pth:"* ]]
then
  PATH="$PATH:$pth"
  export PATH
fi

# initialize specific flags

# the record-building step invokes "go run", so a Go toolchain must be on PATH
hasgo=$( command -v go )
[ -x "$hasgo" ] || {
  echo "ERROR: The Go (golang) compiler must be installed locally in order to process the data, EXITING" >&2
  exit 1
}

# database-specific parameters

dbase="taxonomy"      # database name passed to the pm-* and rchive commands
recname="TaxonInfo"   # record element name used as the -pattern argument
dotmaxIdx="500"       # -dotmax value for incremental indexing
dotmaxInv="10"        # -dotmax value for incremental inversion
# field names handed to rchive -promote
fields="SCIN RANK GNSP COMN TXDV TXSY TAXA LNGE TREE MODS PROP VCHR GC HGC MGC PGC UID"

# control flags set by command-line arguments

# transfer method
useFtp=true useHttps=false noAspera=false

info=false

# cleaning levels
clean=false scrub=false scour=false erase=false zap=false

# stages enabled by default
datafiles=true download=true populate=true

# indexing stages, enabled on request
e2index=false e2invert=false e2collect=false e2merge=false e2post=false

# consume recognized leading arguments; parsing stops at the first argument
# that is not recognized, which is left in place
while [ $# -gt 0 ]
do
  case "$1" in
    daily | -daily )
      e2index=true e2invert=true
      datafiles=true
      ;;
    index | -index | reindex | -reindex )
      e2index=true e2invert=true e2collect=true e2merge=true e2post=true
      datafiles=true
      ;;
    clean | -clean | clear | -clear )
      # delete Indices contents and Increment files
      clean=true
      ;;
    scrub | -scrub )
      # as clean, and delete Postings directories
      clean=true scrub=true
      ;;
    scour | -scour )
      # as scrub, and delete Data, Archive, and Sentinels directories
      clean=true scrub=true scour=true
      ;;
    erase | -erase )
      # as scour, and delete Extras directory contents
      clean=true scrub=true scour=true erase=true
      ;;
    zap | -zap )
      # as scour, and delete Source records and all remaining directories
      clean=true scrub=true scour=true zap=true
      ;;
    -info )
      info=true
      ;;
    -ftp )
      useFtp=true useHttps=false noAspera=true
      export EDIRECT_NO_ASPERA=true
      ;;
    -http | -https )
      useFtp=false useHttps=true
      ;;
    * )
      break
      ;;
  esac
  # every recognized argument is a single word
  shift
done

# cleaning and indexing cannot be combined in one invocation
if [ "$clean" = true ] && [ "$e2index" = true ]
then
  echo "ERROR: Cleaning and indexing must be done in separate commands, EXITING" >&2
  exit 1
fi

# choose the cleaning level, checked in the order zap, erase, scour, scrub, clean
cleanLevel=""
if [ "$zap" = true ]
then
  cleanLevel="-zap"
elif [ "$erase" = true ]
then
  cleanLevel="-erase"
elif [ "$scour" = true ]
then
  cleanLevel="-scour"
elif [ "$scrub" = true ]
then
  cleanLevel="-scrub"
elif [ "$clean" = true ]
then
  cleanLevel="-clean"
fi

# a cleaning request is handed to pm-clean and nothing else is done
if [ -n "$cleanLevel" ]
then
  pm-clean -db "$dbase" "$cleanLevel"
  exit 0
fi

# get path to local folder

# normalized operating system name: an "_NT-..." suffix is reduced to "_NT"
# and a leading MINGW prefix (with optional digits) is rewritten to CYGWIN
osname=$( uname -s )
osname=$( printf '%s\n' "$osname" | sed -e 's/_NT-.*$/_NT/' -e 's/^MINGW[0-9]*/CYGWIN/' )

# GetLocalArchiveFolder DATABASE FOLDER
#
# Prints the path that "rchive -local DATABASE FOLDER" reports, with one
# trailing slash removed.  When osname is CYGWIN_NT and /bin/cygpath is
# executable, the path is first converted with "cygpath -w".
#
# Globals:   osname (read)
# Arguments: $1 - database name
#            $2 - folder name (for example "Archive" or "Data")
# Outputs:   the folder path on stdout; an error message on stderr on failure
# Exits:     with status 1 when rchive reports no path.  When called inside
#            $( ... ) only that subshell exits, so the caller must test the
#            status of the command substitution.
GetLocalArchiveFolder() {

  # working variables are local so a direct call cannot clobber globals
  local dbs="$1"
  local fld="$2"
  local target

  # find selected local archive folder from environment variables or configuration file
  target=$( rchive -local "$dbs" "$fld" )

  if [ -z "$target" ]
  then
    echo "ERROR: Must supply path to local data by setting EDIRECT_LOCAL_ARCHIVE environment variable" >&2
    exit 1
  fi

  if [ "$osname" = "CYGWIN_NT" ] && [ -x /bin/cygpath ]
  then
    target=$( cygpath -w "$target" )
  fi

  # remove trailing slash
  target=${target%/}

  printf '%s\n' "$target"
}

date
echo "" >&2

pm-setup -db "$dbase"

echo "Preparing Drives" >&2
pm-prepare -db "$dbase"
echo "" >&2

# Resolve the working folders.  GetLocalArchiveFolder runs inside a command
# substitution, so its "exit 1" only ends that subshell.  Stop explicitly when
# a folder cannot be resolved rather than carrying on with empty paths.
archiveBase=$( GetLocalArchiveFolder "$dbase" "Archive" ) || exit 1
dataBase=$( GetLocalArchiveFolder "$dbase" "Data" ) || exit 1
extrasBase=$( GetLocalArchiveFolder "$dbase" "Extras" ) || exit 1
indexBase=$( GetLocalArchiveFolder "$dbase" "Index" ) || exit 1
invertBase=$( GetLocalArchiveFolder "$dbase" "Invert" ) || exit 1
mergedBase=$( GetLocalArchiveFolder "$dbase" "Merged" ) || exit 1
postingsBase=$( GetLocalArchiveFolder "$dbase" "Postings" ) || exit 1

# elapsed seconds per stage, filled in when the stage runs

# data preparation stages
DAT="" DWN="" POP=""

# indexing stages
IDX="" INV="" COL="" MRG="" PST=""

# Stage DAT: obtain new_taxdump.tar.gz in the Extras folder (unless a copy is
# already there) and unpack it in place, replacing any existing .dmp files.
if [ "$datafiles" = true ]
then
  seconds_start=$(date "+%s")
  echo "Downloading Taxonomy Data Files" >&2

  # proceed only once actually inside the Extras folder, because the commands
  # below delete and create files relative to the current directory
  if [ -d "${extrasBase}" ] && cd "${extrasBase}"
  then

    # an empty archive is the residue of a failed transfer, not a usable copy
    if [ -f "new_taxdump.tar.gz" ] && [ ! -s "new_taxdump.tar.gz" ]
    then
      rm -f "new_taxdump.tar.gz"
    fi

    if [ ! -f "new_taxdump.tar.gz" ]
    then
      if [ "$useFtp" = true ]
      then
        # uses Aspera Connect (if installed), otherwise FTP
        if [ "$noAspera" = true ]
        then
          nquire -dwn ftp.ncbi.nlm.nih.gov "pub/taxonomy/new_taxdump" "new_taxdump.tar.gz"
        else
          nquire -asp ftp.ncbi.nlm.nih.gov "pub/taxonomy/new_taxdump" "new_taxdump.tar.gz"
        fi
      elif [ "$useHttps" = true ]
      then
        nquire -bulk -get https://ftp.ncbi.nlm.nih.gov pub/taxonomy/new_taxdump new_taxdump.tar.gz > new_taxdump.tar.gz
        # the redirection creates the file even when nothing was received, so
        # discard an empty result instead of letting it block later downloads
        if [ ! -s "new_taxdump.tar.gz" ]
        then
          rm -f "new_taxdump.tar.gz"
          echo "ERROR: Unable to download new_taxdump.tar.gz" >&2
        fi
      fi
    fi

    if [ -f "new_taxdump.tar.gz" ]
    then
      rm -f ./*.dmp
      gunzip -c new_taxdump.tar.gz |
      tar xf -
    fi
  fi

  seconds_end=$(date "+%s")
  seconds=$((seconds_end - seconds_start))
  DAT=$seconds
  echo "DAT $DAT seconds" >&2
  echo "" >&2
  sleep 1
fi

# Stage DWN: run prep-taxoninfo.go from inside the Extras folder to produce
# taxoninfo.xml, then copy the result into the Data folder.
if [ "$download" = true ]
then
  seconds_start=$(date "+%s")
  echo "Creating Taxonomy Records" >&2

  # proceed only once actually inside the Extras folder, because the output
  # file is written relative to the current directory
  if [ -d "${extrasBase}" ] && cd "${extrasBase}"
  then

    rm -f taxoninfo.xml

    go run "$pth/extern/prep-taxoninfo.go" "$pth/data/" > taxoninfo.xml

    # the redirection above always creates the file, so test for content
    # before replacing the copy in the Data folder
    if [ -s "${extrasBase}/taxoninfo.xml" ]
    then
      rm -f "${dataBase}/taxoninfo.xml"
      cp "${extrasBase}/taxoninfo.xml" "${dataBase}/taxoninfo.xml"
    else
      echo "ERROR: prep-taxoninfo.go produced no output, Data copy of taxoninfo.xml not updated" >&2
    fi
  fi

  seconds_end=$(date "+%s")
  seconds=$((seconds_end - seconds_start))
  DWN=$seconds
  echo "DWN $DWN seconds" >&2
  echo "" >&2
  sleep 1
fi

# Stage POP: load taxoninfo.xml from the Extras folder into the local archive
# with rchive, using TaxID as the record identifier.
if [ "$populate" = true ]
then
  seconds_start=$(date "+%s")
  echo "Populating Taxonomy Archive" >&2

  # the input file is named relative to the current directory, so only run
  # rchive after the change into the Extras folder has succeeded
  if [ -d "${extrasBase}" ] && cd "${extrasBase}"
  then

    # NOTE(review): stdin comes from /dev/null, presumably so that rchive
    # takes its data only from the -input file; confirm
    rchive -gzip -db "$dbase" -input "taxoninfo.xml" \
      -archive "${archiveBase}" "${indexBase}" "${invertBase}" \
      -index TaxID -pattern "$recname" < /dev/null
  fi

  seconds_end=$(date "+%s")
  seconds=$((seconds_end - seconds_start))
  POP=$seconds
  echo "POP $POP seconds" >&2
  echo "" >&2
  sleep 1
fi

# check archive by fetching record 1949059 (Ochlandra keralensis): the
# upper-cased first letters of Genus and Species are appended to the
# "Archive is " prefix, giving "OK" if record is successfully retrieved
printf '%s\n' 1949059 |
xfetch -db taxonomy |
xtract -pattern "$recname" -sep "" -pfx "Archive is " -upper "Genus[1:1],Species[1:1]"

printf '\n' >&2

# variable contains taxonomy-database-specific xtract indexing instructions
#
# "read -r -d ''" waits for a NUL delimiter that never arrives, so it loads the
# whole here-document into idxtxt (and returns non-zero at end of input; the
# status is not checked).  Because the EOS delimiter is unquoted, the shell
# drops every backslash-newline pair, so idxtxt holds the command as one line;
# the double quotes stay in the text as literal characters.  The "-" in "<<-"
# strips leading tabs only.  The incremental indexing step later splits this
# text into one argument per line with xargs and skips the leading "xtract".
read -r -d '' idxtxt <<- EOS
xtract -set IdxDocumentSet -rec IdxDocument \
  -pattern TaxonInfo -UID TaxonInfo/TaxID \
    -wrp IdxUid -element "&UID" -clr -rst -tab "" \
    -group TaxonInfo -pkg IdxSearchFields \
      -block TaxonInfo \
        -wrp UID -pad "&UID" \
        -wrp SCIN -element Scientific \
        -wrp GNSP -element Binomial \
        -wrp COMN -element Common \
        -wrp COMN -element GenBank \
        -wrp TXSY -element Synonym \
        -wrp TXDV -element Division -element Division@code \
        -wrp RANK -element Rank \
        -wrp LNGE -element Lineage \
        -wrp TREE -element Tree \
        -wrp GC -element Nuclear \
        -wrp MGC -element Mitochondrial \
        -wrp PGC -element Plastid \
        -wrp HGC -element Hydrogenosome \
        -wrp TAXA -element Domain \
        -wrp TAXA -element Kingdom \
        -wrp TAXA -element Phylum \
        -wrp TAXA -element Class \
        -wrp TAXA -element Order \
        -wrp TAXA -element Family \
        -wrp TAXA -element Genus \
        -wrp TAXA -element Species \
        -wrp MODS -pfx "<MODS>subspecies " -alnum Subspecies \
        -wrp MODS -pfx "<MODS>serovar " -alnum Serovar \
        -wrp MODS -pfx "<MODS>strain " -alnum Strain \
        -wrp MODS -pfx "<MODS>substrain " -alnum Substrain \
        -wrp MODS -pfx "<MODS>clade " -alnum Clade \
        -wrp MODS -pfx "<MODS>note " -alnum Note \
        -wrp VCHR -element SpID \
      -block TaxonInfo -if Subspecies -wrp PROP -lbl "Has Subspecies" \
      -block TaxonInfo -if Strain -wrp PROP -lbl "Has Strain" \
      -block TaxonInfo -if Substrain -wrp PROP -lbl "Has Substrain" \
      -block TaxonInfo -if Clade -wrp PROP -lbl "Has Clade" \
      -block TaxonInfo -if Serovar -wrp PROP -lbl "Has Serovar"
EOS

# Stage IDX: write the xtract indexing arguments to a temporary file and run
# incremental indexing over the archive.
if [ "$e2index" = true ]
then
  seconds_start=$(date "+%s")
  echo "Incremental Indexing" >&2

  # without a temporary file there is nothing valid to pass to -idxargs
  temp=$(mktemp /tmp/INDEX_TEMP.XXXXXXXXX) || {
    echo "ERROR: Unable to create temporary file for indexing arguments, EXITING" >&2
    exit 1
  }
  # generate file with xtract indexing arguments, split onto separate lines, skipping past xtract command itself
  echo "${idxtxt}" | xargs -n1 echo | tail -n +2 > "$temp"
  rchive -db "$dbase" -e2incIndex "${archiveBase}" "${indexBase}" -idxargs "$temp" -dotmax "$dotmaxIdx" -e2index
  rm -f -- "$temp"

  seconds_end=$(date "+%s")
  seconds=$((seconds_end - seconds_start))
  IDX=$seconds
  echo "IDX $IDX seconds" >&2
  echo "" >&2
  sleep 1
fi

# Stage INV: run incremental inversion from the Index folder into the Invert
# folder, then remove the *.inv.gz files at the top level of the Invert folder.
if [ "$e2invert" = true ]
then
  seconds_start=$(date "+%s")
  echo "Incremental Inversion" >&2

  rchive -db "$dbase" -dotmax "$dotmaxInv" -e2incInvert "${indexBase}" "${invertBase}"

  # the deletion uses a relative glob, so it must only run after the change
  # into the Invert folder has succeeded
  if [ -d "${invertBase}" ] && cd "${invertBase}"
  then
    rm -f ./*.inv.gz
  fi

  seconds_end=$(date "+%s")
  seconds=$((seconds_end - seconds_start))
  INV=$seconds
  echo "INV $INV seconds" >&2
  echo "" >&2
  sleep 1
fi

# Stage COL: for each subdirectory of the Invert folder, join its *.inv.gz
# files into one numbered <dbase>NN.inv.gz file at the top of the Invert folder.
if [ "$e2collect" = true ]
then
  seconds_start=$(date "+%s")
  echo "Collect Inverted Sets" >&2

  # deletion and joining use relative globs, so every step is gated on the
  # corresponding directory change having succeeded
  if [ -d "${invertBase}" ] && cd "${invertBase}"
  then

    rm -f ./*.inv.gz

    idx=0
    for dir in "${invertBase}"/*
    do
      # skip plain files and any subdirectory that cannot be entered
      if [ -d "$dir" ] && cd "$dir"
      then
        printf "."
        rchive -gzip -join *.inv.gz > "${invertBase}/${dbase}$(printf '%02d' "$idx").inv.gz"
        idx=$(( idx + 1 ))
      fi
    done
    printf "\n"
  fi

  seconds_end=$(date "+%s")
  seconds=$((seconds_end - seconds_start))
  COL=$seconds
  echo "COL $COL seconds" >&2
  echo "" >&2
  sleep 1
fi

# Stage MRG: merge the collected *.inv.gz files from the Invert folder into
# the Merged folder, then verify that the last expected output file exists.
if [ "$e2merge" = true ]
then
  seconds_start=$(date "+%s")
  echo "Merging Inverted Indices" >&2

  # the input files are named by a relative glob, so only merge after the
  # change into the Invert folder has succeeded
  if [ -d "${invertBase}" ] && cd "${invertBase}"
  then
    rchive -gzip -db "$dbase" -merge "${mergedBase}" *.inv.gz
  fi

  seconds_end=$(date "+%s")
  seconds=$((seconds_end - seconds_start))
  MRG=$seconds
  echo "MRG $MRG seconds" >&2
  echo "" >&2
  sleep 1
  # taxonomy only goes to zyzzyzus warreni
  if [ ! -f "${mergedBase}/zy.mrg.gz" ]
  then
    echo "ERROR: Merge failed to complete - missing zy.mrg.gz file" >&2
    echo "" >&2
    echo "EXITING DUE TO BUILD FAILURE" >&2
    echo "" >&2
    # do not continue: despite the message the script does not exit here,
    # it only disables the postings stage and the checks that depend on it
    e2post=false
  fi
fi

# Stage PST: promote the merged index files into the Postings folder, handing
# them to rchive in sorted batches of at most 100 file names.
if [ "$e2post" = true ]
then
  seconds_start=$(date "+%s")
  echo "Producing Postings Files" >&2

  # the file names come from a relative glob, so only proceed after the change
  # into the Merged folder has succeeded
  if [ -d "${mergedBase}" ] && cd "${mergedBase}"
  then

    for fl in *.mrg.gz
    do
      # an unmatched glob is left as the literal pattern; do not pass it on
      [ -e "$fl" ] && echo "$fl"
    done |
    sort |
    xargs -n 100 echo |
    while read -r -a files
    do
      # some xargs implementations run echo once even on empty input, which
      # yields an empty line and therefore an empty batch
      if [ "${#files[@]}" -gt 0 ]
      then
        rchive -db "$dbase" -promote "${postingsBase}" "$fields" "${files[@]}"
      fi
    done
  fi

  seconds_end=$(date "+%s")
  seconds=$((seconds_end - seconds_start))
  PST=$seconds
  echo "PST $PST seconds" >&2
  echo "" >&2
  sleep 1
fi

# wait for background jobs, if any (none are launched in the visible code)
wait

# check postings: search for the test organism through the new index, fetch
# the matching record, and reduce it to the initials of Genus and Species
okay=""
if [ "$e2post" = true ]
then
  okay=$(
    xsearch -db "$dbase" -query "ochlandra keralensis [GNSP]" |
    xfetch -db taxonomy |
    xtract -pattern "$recname" -sep "" -upper "Genus[1:1],Species[1:1]"
  )
  case "$okay" in
    OK )
      printf '%s\n\n' "Archive and Index are $okay" >&2
      ;;
  esac
fi

# After the postings check has returned "OK", remove the intermediate merged
# files and the collected *.inv.gz files, which are no longer needed.
if [ "$e2post" = true ] && [ "$okay" = "OK" ] && [ -d "${mergedBase}" ]
then
  target="${mergedBase}"
  find "$target" -name "*.mrg" -delete
  find "$target" -name "*.mrg.gz" -delete

  # the deletion uses a relative glob, so it must only run after the change
  # into the Invert folder has succeeded
  if [ -d "${invertBase}" ] && cd "${invertBase}"
  then
    rm -f ./*.inv.gz
  fi
fi

# change to the home directory, then print the summary banner and a blank line
cd

printf '%s\n\n' "ARCHIVE-TAXONOMY" >&2

# PrintTime ENABLED LABEL SECONDS
#
# Writes "LABEL SECONDS seconds" to stderr when ENABLED is the string "true";
# writes nothing for any other value.
PrintTime() {

  case "$1" in
    true )
      printf '%s %s seconds\n' "$2" "$3" >&2
      ;;
  esac
}

# report the elapsed time of every stage whose control flag is true;
# each entry pairs a flag variable name with its report label
for stage in datafiles:DAT download:DWN populate:POP \
             e2index:IDX e2invert:INV e2collect:COL e2merge:MRG e2post:PST
do
  flagName=${stage%%:*}
  label=${stage##*:}
  # the label is also the name of the variable holding the elapsed seconds
  PrintTime "${!flagName}" "$label" "${!label}"
done

printf '\n' >&2

# PrintTotalElapsedTime LABEL SECONDS
#
# Writes "LABEL N second(s)" to stderr and, when N is at least 60, appends
# ", or" followed by the same duration split into its non-zero days, hours,
# minutes, and seconds.
#
# Arguments: $1 - label printed at the start of the line
#            $2 - elapsed time in whole seconds (treated as 0 when empty)
# Outputs:   a single line on stderr
PrintTotalElapsedTime() {
  local L=$1
  # default to zero so the arithmetic below never sees an empty operand
  local T=${2:-0}
  local D=$((T/60/60/24))
  local H=$((T/60/60%24))
  local M=$((T/60%60))
  local S=$((T%60))
  printf '%s %d second' "$L" "$T" 1>&2
  # plural for every count except exactly one, which includes zero
  (( T != 1 )) && printf 's' 1>&2
  if (( T > 59 ))
  then
    printf ', or' 1>&2
    # components are printed only when non-zero, so "> 1" decides the plural
    (( D > 0 )) && printf ' %d day' "$D" 1>&2
    (( D > 1 )) && printf 's' 1>&2
    (( H > 0 )) && printf ' %d hour' "$H" 1>&2
    (( H > 1 )) && printf 's' 1>&2
    (( M > 0 )) && printf ' %d minute' "$M" 1>&2
    (( M > 1 )) && printf 's' 1>&2
    (( S > 0 )) && printf ' %d second' "$S" 1>&2
    (( S > 1 )) && printf 's' 1>&2
  fi
  printf '\n' 1>&2
}

# total wall-clock time since the script started
total_end=$(date "+%s")
TOT=$(( total_end - total_start ))
total=$TOT
PrintTotalElapsedTime "TOT" "$TOT"
printf '\n' >&2
