# This script can be used to process downloaded VISTA data
# VISTA is a bit strange since they don't provide their results in a BED format
# This script will process the results copy-pasted from the website

# The goal is to get the hg19 coordinates of all sequences that tested "Positive" in the VISTA database, whether the sequence originated from mouse (mm) or human (hs)

# Go to https://enhancer.lbl.gov/cgi-bin/imagedb3.pl?form=search&show=1&search.form=no&search.result=yes
# Select the whole page (Command-A), copy it, paste it into a text editor, and save it as a .txt file
# For this run the page was saved as 2019-12-4_VISTA-copypaste.txt

# Convert the copy-pasted VISTA results page into BED-style rows.
#
# Reads the pasted page text on stdin and writes tab-separated rows to stdout:
#   column 1-3: pieces of input field 2 after removing "," and splitting on
#               ":" and "-" (assumes field 2 looks like chr:start-end with
#               thousands separators -- TODO confirm against the site layout)
#   column 4:   input field 1 (presumably the element ID, e.g. hs1 / mm1)
#   column 5:   first two "-"-separated pieces of input field 3 joined by ","
#               (a lone piece is printed without a trailing comma)
#
# Only lines that have at least 6 whitespace-separated fields and contain the
# text "Positive" are kept. Rows are sorted by column 1, then numerically by
# column 2.
#
# NOTE(review): "-" and ":" are split everywhere in the three fields, so a
# hyphen inside field 1 or inside a name in field 3 shifts or truncates the
# later columns -- confirm this is acceptable for the data at hand.
vista_positive_to_bed() {
  # The first 7 and the last 2 lines of the paste are discarded
  # (presumably page header/footer text -- verify if the site layout changes).
  tail -n +8 \
    | sed -e '$d' \
    | sed -e '$d' \
    | awk '
        NF >= 6 && index($0, "Positive") {
          # Reorder to field 2, field 1, field 3, then normalise.
          rec = $2 "\t" $1 "\t" $3
          gsub(/,/, "", rec)          # drop thousands separators
          gsub(/[-: ]/, "\t", rec)    # chr:start-end and A-B become columns
          split(rec, col, "\t")
          out = col[1] "\t" col[2] "\t" col[3] "\t" col[4] "\t" col[5] "," col[6]
          sub(/,$/, "", out)          # no second piece: remove dangling comma
          print out
        }
      ' \
    | LC_ALL=C sort -k1,1 -k2,2n  # byte-order sort: same result in any locale
}

# Redirecting the input (rather than piping from cat) makes a missing input
# file an immediate shell error and leaves any existing output file untouched.
vista_positive_to_bed \
  < 2019-12-4_VISTA-copypaste.txt \
  > 2019-12-4_VISTA-positive.bed

# Overlap the VISTA-positive elements with the SNP set in pip.overlap.snps.bed
# (SNPs with a PIP >= 0.1 that overlap an enriched cell population).
# -wa and -wb ask bedtools to report, for each overlap, the original entry
# from the -a file and the original entry from the -b file.
bedtools intersect \
  -wa \
  -wb \
  -a '2019-12-4_VISTA-positive.bed' \
  -b 'pip.overlap.snps.bed' \
  > '2019-12-4_VISTA-overlaps.txt'

# Write a reduced copy of the positive-element BED for loading into R
# (locus visualization): only the first four tab-separated columns are kept.
awk -F '\t' -v OFS='\t' '{ print $1, $2, $3, $4 }' \
  '2019-12-4_VISTA-positive.bed' \
  > 'VISTA-positive-R.bed'