#! /bin/csh -f
#
#
#	crosscheck.com 
#
#	script for cross-checking potential new heavy atom sites
#	with a list of "known" sites and/or a Patterson map
#	
#	sites can be entered explicitly, or picked from a phased
#	difference Fourier map.
#
# set this to wherever your awk program is
set nawk = nawk
nawk 'BEGIN{print}' >& /dev/null
if($status) set nawk = awk
alias nawk $nawk

set outfile	    = ./crosscheck.list
set logfile	    = ./crosscheck.log
set Patterson_map  = ./best_Patt.map
set Fourier_map    = ./best_phased_diff.map
set sitefile        = ""

set tempfile   = ./crosscheck_temp

# default cutoff values
set hiRES        = 2.7	    # upper resolution limit of map(s)
set loRES        = 1000     # lower resolution cutoff for map-making
set SG		 = ""	    # space group (set by phased map)
set CELL	 = ""	    # unit cell (also read from map)

set PICK_cutoff  = 3	    # sigma cutoff for peak-picking
set CLOSE_peaks  = ""	    # default: 1/3 * $hiRES

# used at the end
set type = "  old"
# RSPS limit (change this if you recompile RSPS)
set MAXPAT = 800000

# no args, print help
if($#argv == 0) goto Help

# this procedure (re)sets most of the above variables
# from either the provided files, or the command line
goto Setup

Help:
cat << EOF

usage: $0 phased_diff.map Patterson.map oldsites.txt 2.7A 3 sigma P212121

where:
phased_diff.map - a phased difference Fourier
Patterson.map 	- a Patterson to check sites against
2.7A		- desired resolution cutoff
3 sigma 	- sigma cutoff for peak-picking
P212121		- your space group (if only Patterson.map is given)
oldsites.txt	- a list of fractional coordinates of "known" sites
		  (can be almost any format, a pdb, mlphare script, or
		   shelx .res file will do)

$0 will:

Pick peaks > (sigma cutoff) in the phased difference Fourier
or just use the provided peaks in "sitefile"

Any picked peaks within (1/3 resolution)A of a "known" site 
(from oldsites.txt) are ignored, as are "map edge sites" and
extra symmetry mates.

The remaining peaks are then cross-checked with the Patterson map.  
Here, the "score" is the average of the Patterson map values at every 
predicted vector (see manual for rsps).  In addition, the expected
cross-peaks to the "known" sites are scored.

If no Patterson is provided, it will be calculated from the phased_diff.map.

A sorted list of all these sites is then printed out in:
$outfile 

with the format:             
 frac  Angsrm  nearest   phased map  Patterson score
x y z   X Y Z  old peak  height/sig  harker cross
| | |   | | |  |             |        |      |

What you do with this list is up to you. :)

EOF

exit
#
#   This procedure (at the bottom of the script) does the following
#   1) scan the command line for mtz and mlphare script files
#   2) set the CELL, SG, and hiRES variables
#   3) generate dataset name lists: ${tempfile}Fs and ${tempfile}Ds 
#   3) generate "known site" lists: ${tempfile}oldsites_* ${tempfile}badsites
Return_from_Setup:

# make sure we have something to work with
if((! -e "$sitefile")&&(! -e "$Fourier_map")) then
    # no way to get sites
    echo "ERROR: no sites to check! "
    echo "       please specify either a map or a filename containing"
    echo "	 your metal sites."
    goto Help
endif
if("$SG" == "") then
    # must have given a Patterson, but no space group
    echo "ERROR: we don't have a space group. "
    echo ""
    echo "please run:"
    echo "$0 P212121 $* "
    echo "or whatever your space group is."
    echo ""
    goto Help
endif


################################################################
#   initial report on intended program flow
################################################################
# start the logfile
echo "" >! $logfile
if(-e "$Fourier_map") then
    echo "Looking for sites > ${PICK_cutoff}*sigma in $Fourier_map" | tee -a $logfile

    if(-e "$sitefile") then
	set oldsites = `cat ${tempfile}oldsites_reduced | wc -l`
	set temp     = `cat ${tempfile}badsites | wc -l`
	if($temp != 0) set oldsites = "$oldsites (plus $temp bad)"
	
	echo "that are not already within ${CLOSE_peaks}A" | tee -a $logfile
	echo "of the $oldsites atoms in $sitefile" | tee -a $logfile
    endif
else
    if(-e "$sitefile") then
	echo "the $oldsites sites in $sitefile will be checked" | tee -a $logfile
	if(-e "$Patterson_map") then
	    echo "against $Patterson_map" | tee -a $logfile
	else
	    # should never get here
	    goto Help
	endif
	echo "" | tee -a $logfile
	
	# no need to pick peaks
	goto RSPS
    endif
endif


Pick_Peaks:
################################################################

 #####      #     ####   #    #
 #    #     #    #    #  #   #
 #    #     #    #       ####
 #####      #    #       #  #
 #          #    #    #  #   #
 #          #     ####   #    #

################################################################
#   pick peaks in the difference Fourier to find new sites
################################################################

# redundant catch
if(! -e "$Fourier_map") then
    # uhh, assume we have some peaks
    goto RSPS
endif

echo ""

# avoid roundoff errors in height / confusions about sigma
# re-scale map so sigma = 1
# and extend it out just a bit (for peak-picking)
mapmask MAPIN $Fourier_map \
 MAPOUT ${tempfile}norm.map << EOF-extend >> $logfile
SCALE SIGMA
XYZLIM -0.1 1.1 -0.1 1.1 -0.1 1.1
EOF-extend


# pick peaks in the extended, normalized map
set temp = `echo "$PICK_cutoff" | nawk '$1+0>=0{print $1} $1+0<0{print -$1, "NEGATIVES"}'`
peakmax MAPIN ${tempfile}norm.map \
  TO ${tempfile}.xyz PEAKS ${tempfile}.xyz << eof-pick >> $logfile
THRESHOLD $temp
#NUMPEAKS 20
OUTPUT PEAKS
END
eof-pick
if($status) then
    set temp = `echo "$PICK_cutoff" | nawk '{printf "%d", $1-1}'`
    if("$temp" > 0) then
	echo "reducing pick cutoff to $temp sigma"
	set PICK_cutoff = "$temp"
	goto Pick_Peaks
    endif
    
    # otherwise, we're just screwed
    echo "unable to pick peaks in $Fourier_map"
    echo "sorry! "
    goto Finish
endif

echo -n "analyzing peaks "

# condense peak data to essential list 
# and remove picks outside the cell 
# (we don't need them, and they could be map-edge picks)
cat ${tempfile}.xyz |\
nawk 'NF>6 && /[^1-9.-]/{ \
print substr($0,23,8), substr($0,31,8), substr($0,39,8), substr($0,6,8)}' |\
nawk '$1+0>0 && $1+0<1 && $2+0>0 && $2+0<1 &&$3+0>0 && $3+0<1' |\
sort -nr +3 >! ${tempfile}peaks
#
# format is xf yf zf height/sigma
#

# clean up
rm -f ${tempfile}.xyz     >& /dev/null
rm -f ${tempfile}norm.map >& /dev/null


################################################################
#   FILTER OUT DUPLICATE SITES
################################################################

# because we picked a whole unit cell above (and then some)
# we need to remove symmetry mates from the peak list

# first, generate ALL symmetry-equivalent positions for the picked peaks
cat << EOF >! ${tempfile}gensym.in
SYMM $SG
CELL $CELL
EOF
cat ${tempfile}peaks |\
nawk '{++n; print "RESIDUE",n; print "ATOM X", $1, $2, $3}' |\
cat >> ${tempfile}gensym.in
cat ${tempfile}gensym.in | gensym |&\
nawk '/List of sites/,/Normal termination/' |\
nawk '$2 ~ /[01].[0-9][0-9][0-9]/{print $2, $3, $4, $5, $6, $7, $(NF-1)}' |\
cat >! ${tempfile}peaks_symm
rm -f ${tempfile}gensym.in >& /dev/null

#  ${tempfile}peaks_symm is now an indexed list of all 
# symmetry-related peak positions in the unit cell

# record peak heights (since they didn't survive gensym)
cat ${tempfile}peaks |\
nawk '{++n; print n, $NF}' |\
cat >! ${tempfile}peak_heights

# now filter the symmetry-expanded list for the
# unique list of non-symmetry related peaks
cat ${tempfile}peak_heights ${tempfile}peaks_symm |\
nawk -v cut=$CLOSE_peaks 'NF==2{height[$1]=$2}\
    NF>3{++n; X[n]=$4; Y[n]=$5; Z[n]=$6; group[n]=$NF;\
	for(i=1;i<n;++i){\
	    dist=sqrt(($4-X[i])^2 +($5-Y[i])^2 +($6-Z[i])^2);\
	    # see if this site has already been printed \
	    if(dist < cut){ ++taken[group[n]];}}; \
	if(! taken[group[n]]) print $1, $2, $3, $4, $5, $6, height[group[n]];\
	# register all the symm mates as taken too \
	++taken[group[n]]}' |\
cat >! ${tempfile}peaks

###################################
# now ${tempfile}peaks is the list of symmetry-unique map peaks
# (without any map-edge picks)
# in the format xf yf zf X Y Z height/sigma

# clean up
rm -f ${tempfile}peak_heights >& /dev/null
rm -f ${tempfile}peaks_symm   >& /dev/null


################################################################
#   FILTER OUT UNINTERESTING SITES
################################################################

if(! -e ${tempfile}oldsites_original) touch ${tempfile}oldsites_original
if(! -e ${tempfile}badsites) touch ${tempfile}badsites

# symmetry-expand the "known" and "known bad" sites
cat << EOF >! ${tempfile}gensym.in
SYMM $SG
CELL $CELL
XYZLIM -0.2 1.2 -0.2 1.2 -0.2 1.2 
EOF
cat ${tempfile}oldsites_original ${tempfile}badsites |\
nawk '{print "ATOM X", $1, $2, $3}' |\
cat >> ${tempfile}gensym.in
cat ${tempfile}gensym.in | gensym |&\
nawk '/List of sites/,/Normal termination/' |\
nawk '$2 ~ /[01].[0-9][0-9][0-9]/{print $2, $3, $4, $5, $6, $7, "known"}' |\
cat >! ${tempfile}all_known_sites
rm -f ${tempfile}gensym.in >& /dev/null

# ${tempfile}all_known_sites now contains ALL symmetry-equivalent
# positions to the sites the user entered as "known" or "bad"


set newsites = `cat ${tempfile}peaks | wc -l`
echo -n "$newsites found > ${PICK_cutoff}*sigma, "

# filter out peaks within $CLOSE_peaks Angstroms of old or bad sites
# AND add the distance to the nearest
cat ${tempfile}all_known_sites ${tempfile}peaks |\
nawk -v cut=$CLOSE_peaks ' \
      $NF=="known"{++n; X[n]=$4; Y[n]=$5; Z[n]=$6} \
      $NF!="known"{minD=999999; \
         # find nearest "old" site \
         for(i=1;i<=n;++i){\
	     dist=sqrt(($4-X[i])^2 +($5-Y[i])^2 +($6-Z[i])^2);\
	     if(dist < minD){\
	     minD=dist;}}\
	 # now see if it is too close \
	 if(minD > cut) {print}}' |\
sort -nr +6 >! ${tempfile}
mv ${tempfile} ${tempfile}peaks

# don't need this anymore
rm -f ${tempfile}badsites >& /dev/null
rm -f ${tempfile}all_known_sites >& /dev/null


###################################
# ${tempfile}peaks still has the format:
#   xf yf zf X Y Z height/sigma
#

set newsites = `cat ${tempfile}peaks | wc -l`
echo "($newsites new)."


Patterson:
################################################################

 #####     ##     #####   #####  ######  #####    ####    ####   #    #
 #    #   #  #      #       #    #       #    #  #       #    #  ##   #
 #    #  #    #     #       #    #####   #    #   ####   #    #  # #  #
 #####   ######     #       #    #       #####        #  #    #  #  # #
 #       #    #     #       #    #       #   #   #    #  #    #  #   ##
 #       #    #     #       #    ######  #    #   ####    ####   #    #

################################################################
#   make Patterson,  if we don't have it
################################################################

if(-e "$Patterson_map") then
    # we don't need to calculate it then
    goto RSPS
endif

if(! -e "$Fourier_map") then
    # should never get here either
    echo "ERROR: no maps available! "
    goto Help
endif

# get maximum possible resolution (to fit into RSPS)
set PattRES = `echo $CELL | nawk -v pts=$MAXPAT '{printf "%.2f\n", 3*(($1 * $2 * $3)/pts)^(1/3)}'`
set PattRES = `echo "$PattRES $hiRES" | nawk '$1>$2{print $1} $2>$1{print $2}'`

# use sfall to back-transform the difference map
echo ""
echo "making Patterson from $Fourier_map at $PattRES A"

# overdrive the sampling a bit
set temp = `echo $mapRES | nawk '{print 0.9*$1}'`
sfall MAPIN $Fourier_map HKLOUT ${tempfile}reversed.mtz << EOF-sfall >> $logfile
SYMM $SG
RESOLUTION 1000 $temp
MODE SFCALC MAPIN
END
EOF-sfall
if($status) then
    # hmm, maybe resolution is too high?
    echo "GOTHERE"
    exit
endif

# re-transform as a Patterson

Patt_again:
# calculate the Patterson map
fft HKLIN ${tempfile}reversed.mtz MAPOUT ${tempfile}_Patt.map << EOF-fft >> $logfile
RESOLUTION $loRES $PattRES
PATTERSON
title  Patterson version of $Fourier_map
LABIN F1=FC
END
EOF-fft

# check to make sure it's not too big
set size = `ls -l ${tempfile}_Patt.map | nawk '{printf "%d\n", $5/4}'`
if("$size" > $MAXPAT) then
    # this is not going to work in RSPS
    # make it smaller
    
    # crude cell volume
    set PattRES = `echo $PattRES | nawk '{print 1.1*$1}'`
    
    echo "woops, map too big for rsps, reducing resolution to $PattRES."
    goto Patt_again
endif

# clean up
rm -f ${tempfile}reversed.mtz >& /dev/null
set Patterson_map = ${tempfile}_Patt.map


RSPS:
################################################################

 #####    ####   #####    ####
 #    #  #       #    #  #
 #    #   ####   #    #   ####
 #####        #  #####        #
 #   #   #    #  #       #    #
 #    #   ####   #        ####

################################################################
#   use RSPS to get a score for every putative site with the Patterson
################################################################
if(! -e ${tempfile}peaks) touch ${tempfile}peaks
if(! -e ${tempfile}oldsites_reduced) touch ${tempfile}oldsites_reduced
set newsites = `cat ${tempfile}peaks | wc -l`
set oldsites = `cat ${tempfile}oldsites_reduced | wc -l`

echo ""
set type = "other"
# decide what kind of site data we have
if($oldsites > 0) then
    if($newsites > 0) then
	# both old and new sites avaialble, so check the
	# new sites for crossvectors to the old sites
	echo "checking $newsites new sites against $oldsites old sites on Patterson"
	##### normal behavior #######
	set type = "  old"
    else
	# we have old sites, but no new ones
	if(-e "$Fourier_map") then
	    # we DID, however, pick peaks 
	    # they must have all been filtered out
	    echo "no sites to check against Patterson"
	    goto Finish
	endif
	
	# otherwise, score the provided "old" sites against themselves in
	# the crossvector search
	echo "checking $oldsites sites against the Patterson"
	
	# convert "old site" file to a peak-pick file
	cat ${tempfile}oldsites_reduced |\
	nawk '{print $1, $2, $3, $4, $5, $6, "?.??"}' |\
	cat >! ${tempfile}peaks
    endif
else
    # $oldsites == 0
    if($newsites > 0) then
	# we have new sites, but no old ones
	# set up "new" as "old" so we can report some kind of
	# cross-vector score
	echo "checking $newsites new peaks against the Patterson"
	
	# convert picked peaks to an "old site" file
	cat ${tempfile}peaks |\
	nawk '{++n; print $1, $2, $3, $4, $5, $6, "?", "site"}' |\
	cat >! ${tempfile}oldsites_reduced
	
	# backtrack to "original" file too
	cat ${tempfile}oldsites_reduced |\
	nawk '{print $1, $2, $3}' |\
	cat >! ${tempfile}oldsites_original
    else
	# no sites at all!
	echo "no sites to check against the Patterson"
	goto Finish
    endif
endif

# now, for each "new" site calculate the distance to
# the nearest "other" site 
# new-old if we have both
# new-new if we had no old sites
# old-old if no new peaks were found

# symmetry-expand the "old" sites
cat << EOF >! ${tempfile}gensym.in
SYMM $SG
CELL $CELL
XYZLIM -0.2 1.2 -0.2 1.2 -0.2 1.2 
EOF
cat ${tempfile}oldsites_original |\
nawk '{print "RESIDUE", NR; print "ATOM X", $1, $2, $3}' |\
cat >> ${tempfile}gensym.in
cat ${tempfile}gensym.in | gensym |&\
nawk '/List of sites/,/Normal termination/' |\
nawk '$2 ~ /[01].[0-9][0-9][0-9]/{print $2, $3, $4, $5, $6, $7, $(NF-1), "sym"}' |\
cat >! ${tempfile}oldsites_sym
rm -f ${tempfile}gensym.in >& /dev/null

# label "old" sites that are really "new"
if($oldsites == 0) then
    # "old" sites are really "new" sites, indicate this in the file
    cat ${tempfile}oldsites_sym |\
    nawk '{$7= $7 "?"; print}' |\
    cat >! ${tempfile}
    mv ${tempfile} ${tempfile}oldsites_sym
endif

# measure the distances
cat ${tempfile}oldsites_sym ${tempfile}peaks |\
nawk '# record "old" site positions \
  $NF=="sym"{++n; X[n]=$4; Y[n]=$5; Z[n]=$6; site[n]=$7} \
  $NF!="sym"{++j; minD=999999; for(i=1;i<=n;++i){ \
    # find nearest "old" site \
    dist=sqrt(($4-X[i])^2 +($5-Y[i])^2 +($6-Z[i])^2);\
    # only consider distances not flagged as same site \
    if(site[i]+0!=j || (site[i] !~ /[?]/ && $0 !~ /[?]/)) \
      if(dist < minD) minD=dist}\
    # tack this distance to the end of the line \
    print $0, minD}' |\
cat >! ${tempfile}
mv ${tempfile} ${tempfile}peaks

rm -f ${tempfile}oldsites_sym >& /dev/null

# ${tempfile}peaks now has the format:
# xf yf zf X Y Z height/sigma dist


harker:
################################################################
#   check sites against harker sections
################################################################
cat << EOF >! ${tempfile}rspsin
spacegroup $SG
patfile $Patterson_map
scorfile ${tempfile}rsps.map
reject 0
mode harker
scan au
EOF

# ask for harker score of each peak
cat ${tempfile}peaks |\
nawk '{print "vlist", $1, $2, $3}' |\
cat >> ${tempfile}rspsin

# get harker scores
cat ${tempfile}rspsin | rsps | tee -a $logfile |\
nawk '/Rms deviation from mean/{rms=$NF; if(rms<=0) rms=1}\
$1=="SCORE"{++n; print n, $3, $3/rms}' |\
cat >! ${tempfile}harker_score

# check that it worked
set temp = `cat ${tempfile}harker_score | wc -l`
if($temp == 0) then
    echo "RSPS failed! "
    echo "sorry! look in $logfile to see what went wrong."
    goto Finish
endif

################################################################
#   filter "fixxyz" sites before doing cross score? 
################################################################
# get cross-vector score to known peaks (if any)
# but first, check that RSPS can handle the load
set oldsites = `cat ${tempfile}oldsites_reduced | wc -l`
if($oldsites > 30) then
    # we have a problem: RSPS can't hold more than 30 fixxyz sites
    echo "WARNING: RSPS can't handle $oldsites reference sites"
    echo "         cross-validation will be limited to 30 $sitefile sites"
    echo "         which map to the heightst harker peaks."
    
    cat << EOF >! ${tempfile}rspsin
    spacegroup $SG
    patfile $Patterson_map
    scorfile ${tempfile}rsps.map
    reject 0
    mode harker
EOF
    # ask for harker score of "old" sites
    cat ${tempfile}oldsites_reduced |\
    nawk '{print "vlist", $1, $2, $3}' |\
    cat >> ${tempfile}rspsin
    
    # run RSPS
    cat ${tempfile}rspsin | rsps |& tee -a $logfile |\
    nawk '$1=="SCORE"{++n; print n, $3}' |\
    sort -nr +1 | head -30 >! ${tempfile}pattscore
    
    # screen these top peaks out of the original sites
    cat ${tempfile}pattscore ${tempfile}oldsites_reduced |\
    nawk 'NF==2{hscore[$1]=$2} \
          NF>6{++n; if(hscore[n]) print "fixxyz", $1, $2, $3}' |\
    cat >! ${tempfile}fixxyz
else
    # okay to use all old sites
    cat ${tempfile}oldsites_reduced |\
    nawk '{print "fixxyz", $1, $2, $3}' |\
    cat >! ${tempfile}fixxyz
endif

################################################################
#   check sites against cross peaks to known sites
################################################################
# now make RSPS input for cross-vector search
cat << EOF >! ${tempfile}rspsin
spacegroup $SG
patfile $Patterson_map 
scorfile ${tempfile}rsps.map
reject 0
mode cross
EOF

# retrieve "fixed" known sites (even if they are also the querry sites? )
cat ${tempfile}fixxyz >> ${tempfile}rspsin
rm -f ${tempfile}fixxyz >& /dev/null

# ask for cross-vector score to old peaks
echo "scan au" >> ${tempfile}rspsin
cat ${tempfile}peaks |\
nawk '{print "vlist", $1, $2, $3}' |\
cat >> ${tempfile}rspsin

    
# get crosspeak scores
cat ${tempfile}rspsin | rsps |& tee -a $logfile |\
nawk '/Rms deviation from mean/{rms=$NF; if(rms<=0) rms=1}\
$1=="SCORE"{++n; print n, $3, $3/rms}' |\
cat >! ${tempfile}cross_score

    
# clean up
rm -f ${tempfile}rspsin   >& /dev/null
rm -f ${tempfile}rsps.map >& /dev/null
# only if user didn't want it
rm -f ${tempfile}bestPatterson.map >& /dev/null


# assign Patterson score to data in files
cat ${tempfile}harker_score |\
nawk '{score[$1] += $3} \
 END{for(i in score) print i, score[i], "harker"}' |\
sort -n >! ${tempfile}pattscore

cat ${tempfile}cross_score |\
nawk '{score[$1] += $3} \
 END{for(i in score) print i, score[i], "cross"}' |\
sort -n >> ${tempfile}pattscore


Finish:
#################################################

 ######     #    #    #     #     ####   #    #
 #          #    ##   #     #    #       #    #
 #####      #    # #  #     #     ####   ######
 #          #    #  # #     #         #  #    #
 #          #    #   ##     #    #    #  #    #
 #          #    #    #     #     ####   #    #

#################################################
#
# put all these scores into the same file
#
#################################################
if(! -e "${tempfile}pattscore") touch ${tempfile}pattscore
if(! -e "${tempfile}peaks") touch ${tempfile}peaks


# put all scores on the same line
cat ${tempfile}pattscore ${tempfile}peaks |\
nawk '$NF=="harker"{hscore[$1]=$2} \
      $NF=="cross" {cscore[$1]=$2} \
      NF>6{++n; print $0, hscore[n], cscore[n]}' |\
cat >! ${tempfile}scores


echo "" >! $outfile
echo "fractional  coordinates       Angstrom    coordinates      nearest     height/sig         Patterson map score  " >> $outfile
echo "  x        y        z          X        Y        Z      $type site     on Fourier         harker         cross " >> $outfile
cat ${tempfile}scores |\
nawk '{printf "%8.5f %8.5f %8.5f %8.3f %8.3f %8.3f   %8.2fA %14s    %14.5f %14.5f\n",\
               $1, $2, $3, $4, $5, $6, $8, $7, $9, $10}' |\
sort +7nr -8 +8nr -9 +9nr -10 >> $outfile
set newsites = `cat ${tempfile}scores | wc -l`
if($newsites > 0) then
    echo ""
    echo "all peaks and scores written to $outfile"
endif


# now figure out what to do with it! 


# clean up
Clean_up_sites:


rm -f ${tempfile}harker_score >& /dev/null
rm -f ${tempfile}cross_score  >& /dev/null
rm -f ${tempfile}pattscore    >& /dev/null
rm -f ${tempfile}oldsites_original >& /dev/null
rm -f ${tempfile}oldsites_reduced  >& /dev/null
rm -f ${tempfile}badsites     >& /dev/null
rm -f ${tempfile}scores >& /dev/null
rm -f ${tempfile}peaks  >& /dev/null
rm -f ${tempfile}_Patt.map >& /dev/null


exit


Setup:
#################################################

  ####   ######   #####  #    #  #####
 #       #          #    #    #  #    #
  ####   #####      #    #    #  #    #
      #  #          #    #    #  #####
 #    #  #          #    #    #  #
  ####   ######     #     ####   #

#################################################
#
#   gather information on:
#    map files
#    known sites
#    "bad" sites
#    resolution limit
#
##################################################

goto Unwrap_Awk_Scripts
# deploy ${tempfile}sitereader.awk
Return_Unwrap_Awk_Scripts:

set mapfiles = ""

# scan the command line for files
echo -n "" >! ${tempfile}oldsites_original
echo -n "" >! ${tempfile}oldsites_reduced
echo -n "" >! ${tempfile}badsites
echo -n "" >! ${tempfile}peaks
foreach arg ( $* )
    # warn about probable mispellings
    if((! -e "$arg")&&(($arg =~ *.pdb)||($arg =~ *.map))) then
	echo "WARNING: $arg does not exist"
    endif
    if(-e "$arg") then
	if("$arg" =~ *.pdb) then
	    set pdbfile = "$arg"
	    continue
	endif
	
	if("$arg" =~ *.map) then
	    set mapfiles = "$mapfiles $arg"
	    continue
	endif
	
	# some kind of file, look for fractional coordinates
#	set temp = `file $arg | nawk '{print $NF}'`
#	if("$temp" == "text") then
	set temp = `ls -lLd $arg |& nawk '/^-/{print $5+0}' | head -1`
	if(("$temp" != "")&&("$temp" != "0")) then
	    if($temp > 10000) then
		# this is too big
		continue
	    endif
	    # general XYZ coordinate extraction
	    cat $arg |\
	    nawk '{for(i=1;i<=NF;++i){\
	        if((($i ~ /^[01].[0-9][0-9][0-9]/)||\
	            ($i ~ /^-[01].[0-9][0-9][0-9]/))&&\
	    	($i+0>=-1.1)&&($i+0<=1.1))\
	                {++coords}else{coords=0};\
	        if(coords==3)\
	        {print $(i-2), $(i-1), $i; coords=0; i=NF}}}' |\
	    cat >! ${tempfile}
	    set temp = `cat ${tempfile} | wc -l`
	    if($temp > 0) then
		# might as well use it
		cp ${tempfile} ${tempfile}oldsites_original >& /dev/null
		set sitefile = "$arg"
	    endif
	    rm -f ${tempfile} >& /dev/null
	else
	    echo "${arg} is a "\"$temp\"" file.  We need text."
	endif
    endif
end


# user specified a map, Patterson or Fourier?
foreach map ( $mapfiles )
    # have a look at the map header
    echo "" |\
    mapdump MAPIN $map |\
    cat >! ${tempfile}mapheader
	
    set mapSG = `nawk '$1=="Space-group"{print $NF}' ${tempfile}mapheader`
    if("$mapSG" == "") then
	echo "ERROR: $map is unreadable! Ignoring $map"
	continue
    endif
    set mapSG = `nawk -v SG="$mapSG" '$1==SG{print $4}' $CLIBD/symop.lib`
    if("$mapSG" == "") then
	echo "ERROR: $map is in an unknown space group! Ignoring $map"
	continue
    endif
    
    # set variables from this map
    set CELL = `nawk '/Cell dimensions/{print $4, $5, $6, $7, $8, $9}' ${tempfile}mapheader`
    set hiRES = `nawk '/Grid sampling/{grid=$8} /Cell dimensions/ && grid+0>0{print 3*$4/grid}' ${tempfile}mapheader`
	
    if(("$mapSG" =~ *[abcdmn]*)||("$mapSG" =~ *-*)) then
	# this must be a Patterson
	set Patterson_map = "$map"
	set userPatt = "$map"
	
	# guess the original space group?
	
	continue
    else
	# must be a real-space map (from same structure?)
	set Fourier_map = "$map"
	set userFourier = "$map"
	set SG = "$mapSG"
	set mapRES = "$hiRES"
	continue
    endif
	
    # got here? how?
    echo "ERROR: unable to use $map"
    echo "       couldn't figure out what kind of map it is."
end

rm -f ${tempfile}mapheader >& /dev/null

# now all filenames have been initialized
if((! -e "$Fourier_map")&&(! -e "$Patterson_map")) then
    # cannot proceed with NO maps
    # it's not our business to calculate them from mtzs
    echo "ERROR: no maps specified! "
    echo " nothing for us to do! "
    goto Help
endif


# space group from pdb?
if($?pdbfile) then
    # atoms were entered as a PDB
    set sitefile = $pdbfile
        
    nawk '/^CRYST/{print substr($0, 55, 12); exit}' $pdbfile |\
    nawk '{print $1, $2, $3, $4}' |\
    nawk '/^[PCIFHR]/{printf "%s", $1; for(i=2;i<=NF;++i){\
	if((($i !~ /^[1-6]$/)&&($i !~ /^[1-6][1-5]$/))||\
	(substr($i,1,1)+0 < substr($i,2,1)+0)){exit}; printf $i}} \
	END{print ""}' |\
    cat >! ${tempfile}
    set pdbSG = `cat ${tempfile}`
    rm -f ${tempfile} >& /dev/null
    
    if("$pdbSG" != "") then
	# got something, check it:
	set temp = `nawk -v SG="$pdbSG" '$4==SG{print $4}' $CLIBD/symop.lib`
	if("$temp" != "") then
	    # use it
	    set SG = "$pdbSG"
	else
	    echo "WARNING: $pdbSG from $pdbfile is not a real space group."
	endif
    endif

    # may as well check the cells
    echo "MAP $CELL" |\
      nawk '{printf "MAP %8.3f %8.3f %6.1f %6.1f %6.1f %6.1f\n", $2, $3, $4, $5, $6, $7}' |\
    cat >! ${tempfile}cell
    nawk '/^CRYST/' $pdbfile |\
      nawk '{printf "PDB %8.3f %8.3f %6.1f %6.1f %6.1f %6.1f\n", $2, $3, $4, $5, $6, $7}' |\
    cat >> ${tempfile}cell
    cat ${tempfile}cell |\
    nawk 'NR==1{for(i=2;i<=NF;++i){c[i]=$i;}}\
          NR==2{for(i=2;i<=NF;++i){d=sqrt(($i-c[i])^2); if(maxd<d)maxd=d;}}\
	  END{printf "%d", maxd*100}' |\
    cat >! ${tempfile}
    set temp = `head -1 ${tempfile}`
    if("$temp" == "") set temp = 0
    rm -f ${tempfile}
    if($temp > 3) then
	echo "WARNING: unit cells"
	cat ${tempfile}cell
	echo "are kind of different! "
    endif
    rm -f ${tempfile}cell
endif


# grab other things from the mlphare script?
if(-e "$sitefile") then
    # get resolution
    set temp = `nawk '/^RESO/ && $NF+0 >0.1 && $NF+0 < 6 {print $NF}' $sitefile`
    if("$temp" != "") then
	set hiRES = "$temp"
    endif
    
    # look for space group
    cat $sitefile |\
    nawk '{for(i=1;i<=length($0);++i){c=substr($0,i,1); \
        if(c !~ /[PpCcIiFfRr1-6]/){print ""}else{printf "%s", c}}print ""}' |\
    nawk '/^[PpCcIiFfRr][1-6]/ && substr($1,2) !~ /[A-Z]/' |\
    cat >! ${tempfile}
    
    set scriptSG = `cat ${tempfile}`
    rm -f ${tempfile} >& /dev/null
    
    if("$scriptSG" != "") then
	# got something, check it:
	set temp = `nawk -v SG="$scriptSG" '$4==SG{print $4}' $CLIBD/symop.lib`
	if("$temp" != "") then
	    # use it
	    set SG = "$scriptSG"
	else
	    echo "WARNING: $scriptSG from $sitefile is not a real space group."
	endif
    endif
endif


# grab something from shelx file?


# one last pass through command line
# allow user overrides of all internal variables
set i = 0
while( $i < $#argv )
    @ i = ( $i + 1 )
    @ nexti = ( $i + 1 )
    @ lasti = ( $i - 1 )
    if($nexti > $#argv) set nexti = $#argv
    if($lasti < 1) set lasti = 1
    set arg = "$argv[$i]"
    

    if(! -e "$arg") then
	# space group
	if("$arg" =~ [PpCcIiFfRr][1-6]*) then
	    set temp = `echo $arg | nawk '{print toupper($1)}'`
	    if($?CLIBD) then
		set temp = `nawk -v SG=$temp '$4 == SG {print $4}' $CLIBD/symop.lib | head -1`
	    endif
	    if("$temp" != "") then
		# add this SG to the space group list
		set SG = "$temp"
	    endif
	endif


	# see if a dataset label was given		
	if("$arg" =~ [0-9]*) then
	    # we have a number
	    if(("$arg" =~ *A)||("$argv[$nexti]" == "A")) then
		# user-preferred resolution limits
		set temp = `echo "$arg" | nawk 'BEGIN{FS="-"} $1+0 > 0.1{print $1+0} $2+0 > 0.1{print $2+0}'`
		if($#temp != 1) then
		    set temp = `echo $temp | nawk '$1>$2{print $1, $2} $2>$1{print $2, $1}'`
		    if($#temp == 2) then
			set loRES = "$temp[1]"
			set hiRES = "$temp[2]"
		    endif
		else
		    if("$temp" != "") set hiRES = "$temp"
		endif
	    endif
	    
	    if(("$arg" =~ *[Ss]igma)||("$argv[$nexti]" =~ [Ss]igma)) then
		set PICK_cutoff = `echo "$arg" | nawk '$1+0 > 0{print $1+0}'`
	    endif
	    
	    if(("$arg" =~ *[Dd]iff)||("$argv[$lasti]" =~ [Dd]iff)) then
		set MAX_diso = `echo "$arg" | nawk '$1+0 > 0{print $1+0}'`
		set MAX_dano = `echo "$arg" | nawk '$1+0 > 0{print $1+0}'`
	    endif
	    if(("$arg" =~ *[Dd]iso)||("$argv[$lasti]" =~ [Dd]iso)) then
		set MAX_diso = `echo "$arg" | nawk '$1+0 > 0{print $1+0}'`
	    endif
	    if(("$arg" =~ *[Dd]ano)||("$argv[$lasti]" =~ [Dd]ano)) then
		set MAX_dano = `echo "$arg" | nawk '$1+0 > 0{print $1+0}'`
	    endif
	endif
	
	# allow "NO" logic to carry through
	unset NO
	if(("$arg" == "no")||("$arg" == "not")) set NO
	if(("$arg" == "don't")||("$arg" == "ignore")) set NO
	if("$arg" == "except") set NO
    endif
end

# calculate expected spacial discrimination limit as 1/3 resolution limit
if("$CLOSE_peaks" == "") then
    set CLOSE_peaks = `echo "$hiRES" | nawk '{printf "%.2f", $1/3}'`
endif


##########################################################
# now establish locations of "known" atoms
echo -n "" >! ${tempfile}badsites
if(-e "$sitefile") then
    # make sure we handle mlphare scripts properly
    grep mlphare $sitefile >& /dev/null
    if(! $status) then
	# only "ATOM" cards should be used
	nawk '$1~/^ATOM/{print $3, $4, $5}' $sitefile >! ${tempfile}
	set temp = `cat ${tempfile} | wc -l`
	if("$temp" != 0) then
	    # THESE are the atoms we want cross-peaks to
	    mv ${tempfile} ${tempfile}oldsites_original >& /dev/null
	endif
	
	# Phaser's "BADATOM" cards at bottom, unexecuted end of the script
	cat $sitefile |\
	nawk '$1~/^BADATOM/{print $3, $4, $5}' |\
	cat >! ${tempfile}badsites
	# ${tempfile}badsites may or may not have anything in it
    endif
    
    # sites were already extracted into ${tempfile}oldsites_original
    # during command-line scan (or directly above)
    
endif

if($?pdbfile) then
    
    # use coordconv to get fractional coordinates of these atoms
    coordconv XYZIN $pdbfile XYZOUT ${tempfile}oldsites.xyz << EOF-conv > /dev/null
    INPUT PDB
    CELL $CELL
    OUTPUT FRAC
    END
EOF-conv
    nawk '{print $2, $3, $4}' ${tempfile}oldsites.xyz |\
    cat >! ${tempfile}oldsites_original
    rm -f ${tempfile}oldsites.xyz >& /dev/null
endif

# at this point, whatever sites are avaiable have been
# read into ${tempfile}oldsites_original as fractional x y z


set inputsites = `cat ${tempfile}oldsites_original | wc -l`

# generate ALL symmetry-equivalent positions for ALL input sites
cat << EOF >! ${tempfile}gensym.in
SYMM $SG
CELL $CELL
XYZLIM -0.1 1.1 -0.1 1.1 -0.1 1.1 
EOF
cat ${tempfile}oldsites_original |\
nawk '{++n; print "RESIDUE",n; print "ATOM X", $0}' |\
cat >> ${tempfile}gensym.in
cat ${tempfile}gensym.in | gensym |&\
nawk '/List of sites/,/Normal termination/' |\
nawk '$2 ~ /[01].[0-9][0-9][0-9]/{print $2, $3, $4, $5, $6, $7, $(NF-1), "old"}' |\
cat >! ${tempfile}oldsites_allsym
rm -f ${tempfile}gensym.in >& /dev/null


# now filter the symmetry-expanded list for the
# unique list of independent sites
cat ${tempfile}oldsites_allsym |\
nawk -v cut=$CLOSE_peaks '\
    $NF=="old"{++n; X[n]=$4; Y[n]=$5; Z[n]=$6; group[n]=$(NF-1); \
	for(i=1;i<n;++i){\
	    dist=sqrt(($4-X[i])^2 +($5-Y[i])^2 +($6-Z[i])^2);\
	    # see if this site has already been printed \
	    if(dist < cut){ ++taken[group[n]];}}; \
	if(! taken[group[n]]) print $1, $2, $3, $4, $5, $6, group[n], "site";\
	# register all the symm mates as taken too \
	++taken[group[n]]}' |\
cat >! ${tempfile}oldsites_reduced

# we can always regenerate the symm sites
rm -f ${tempfile}oldsites_allsym

# remember number of unique sites
set oldsites = `cat ${tempfile}oldsites_reduced | wc -l`


# ${tempfile}oldsites_reduced now contains the XYZ coordinates of
# each unique atom site in the $sitefile (or $pdbfile)
# in fractional and Angstrom units
# labelled as: xf yf zf X Y Z site# "site"

# ${tempfile}oldsites_original 
# will later be combined with bad sites to filter out
# "uninteresting" sites from the peak-pick
# formatted as: xf yf zf

# also, ${tempfile}badsites contains a list of "bad" sites 
# (as) written by Phaser Elves in their mlphare scripts.

# don't need this anymore
rm -f ${tempfile}sitereader.awk >& /dev/null

goto Return_from_Setup


Unwrap_Awk_Scripts:

cat << EOF-sitereader >! ${tempfile}sitereader.awk
#! $nawk -f
#
#
#	Retrieve arbitrarily formatted heavy-metal sites
#
#	looks for contiguous blocks of lines 
#	all of them containing three consecutive numbers written
#	to 3 or more decimal places and betwen -1.1 and +1.1
#
BEGIN{
    # estimate of B, to aid in finding real B-factors
    if(! B) B = 50
    if(! wilsonB) wilsonB = B
    if(! expectB) expectB = wilsonB
    
    if(! OCC) OCC = 1
    if(! expectOCC) expectOCC = OCC
}

# line must have at least 3 words

NF>=3{
    coords=0;
    for(i=1;i<=NF;++i)
    {
	# pattern to recognize fractional coordinates
	if(((\$i ~ /^[01].[0-9][0-9][0-9]/)||(\$i ~ /^-[01].[0-9][0-9][0-9]/))&&
	    (\$i+0>=-1.1)&&(\$i+0<=1.1))
	{
	    ++coords
	}
	else
	{
	    coords=0
	}
      
	# check for 3 consecutive hits \\
	if(coords==3)
	{
	    # this line is a site
	    ++site;
	    ++sites;
	    
	    XYZ[site] = ""
	    for(i=i-2;i<=NF;++i)
	    {
		# remember this line
		if(\$i !~ /[^0-9.-]/)
		{
		    XYZ[site] = XYZ[site] " " \$i;
		}
	    }

	    # reset site list with each new block
	    if(sites>site) sites = site;
	}
    }
    
    if(coords != 3)
    {
	# this was not a site, so reset the site counter for the
	# next block of sites
	site = 0;
    }
}

END {
    # bail if we found no sites
    if(! sites) exit

    # see if we can assign the rest of the numbers
    maxnums = 10
    for(site=1;site<=sites;++site)
    {
	nums=split(XYZ[site], num)
	
	# remember length of shortest list
	if(nums<maxnums) maxnums = nums;
	
	# compute the mean of each position
	for(i=4;i<=nums;++i)
	{
	    mean[i] += num[i];
	}
    }
    for(i=4;i<=maxnums;++i)
    {
	# finish computing the mean
	if(sites) mean[i]/=sites;
	
	# keep track of the column with biggest numbers
	if(mean[i]>mean[max_value]) max_value = i;
    }
    	
    # biggest B-factor dominates
    if(sqrt((mean[max_value]-expectB)^2)/expectB < 0.5)
    {
	BFAC_col=max_value;
    }
    
    # start on right side searching for B factor
    for(i=maxnums;i>3;--i)
    {
	# check each column against the expected B value
	if(sqrt((mean[i]-expectB)^2)/expectB < 0.5)
	{
	    # this column is within 50% of expected B factor
	    if(! BFAC_col) BFAC_col=i;
	}
    }
    
    # start at 4th position for occupancy
    for(i=4;i<=maxnums;++i)
    {
	# real occupancy could be positive or negative
	if((sqrt((sqrt(mean[i]^2)-expectOCC)^2)/expectOCC < 0.8)&&(i != BFAC_col))
	{
	    # this column is within 50% of expected occupancy value 
	    # (ignoring sign)
	    if(! OCC_col) OCC_col=i;
	}
	
	# anomalous occupancy is always positive, and usually after OCC
	if((sqrt((mean[i]-sqrt(expectOCC^2))^2)/expectOCC < 0.8) && (i != OCC_col) && (i != BFAC_col))
	{
	    # this column is within 50% of expected occupancy value
	    if(! AOCC_col) AOCC_col=i;
	}	
    }
    
    # now print out the last, contiguous block of sites
    for(site=1;site<=sites;++site)
    {
	# preliminary assignment of variables
	nums=split(XYZ[site], num)
	X   = num[1];
	Y   = num[2];
	Z   = num[3];
	OCC ="?.???";
	AOCC="?.???";
	BFAC="?.???";
	
	# assign values (if they are known)
	if( OCC_col)  OCC = sprintf("%6.3f", num[OCC_col]);
	if(AOCC_col) AOCC = sprintf("%6.3f", num[AOCC_col]);
	if(BFAC_col) BFAC = sprintf("%8.3f", num[BFAC_col]);

	if((AOCC=="?.???")&&(OCC!="?.???"))
	{
	    # might as well...
	    AOCC = sprintf("%6.3f", sqrt(OCC^2));
	}
	
	# now print out the site
	printf " ATOM%-3d ANO  %6.5f %6.5f %6.5f %6s %6s BFAC %8s\\n", 
	  site, X, Y, Z, OCC, AOCC, BFAC;
    }
}
EOF-sitereader
chmod a+x ${tempfile}sitereader.awk


goto Return_Unwrap_Awk_Scripts


####################################################

The Future:
compare two sets of sites for Patterson-equivalence
new dataset weighting scheme?