#! /bin/csh -f # # # crosscheck.com # # script for cross-checking potential new heavy atom sites # with a list of "known" sites and/or a Patterson map # # sites can be entered explicitly, or picked from a phased # difference Fourier map. # # set this to wherever your awk program is set nawk = nawk nawk 'BEGIN{print}' >& /dev/null if($status) set nawk = awk alias nawk $nawk set outfile = ./crosscheck.list set logfile = ./crosscheck.log set Patterson_map = ./best_Patt.map set Fourier_map = ./best_phased_diff.map set sitefile = "" set tempfile = ./crosscheck_temp # default cutoff values set hiRES = 2.7 # upper resolution limit of map(s) set loRES = 1000 # lower resolution cutoff for map-making set SG = "" # space group (set by phased map) set CELL = "" # unit cell (also read from map) set PICK_cutoff = 3 # sigma cutoff for peak-picking set CLOSE_peaks = "" # default: 1/3 * $hiRES # used at the end set type = " old" # RSPS limit (change this if you recompile RSPS) set MAXPAT = 800000 # no args, print help if($#argv == 0) goto Help # this procedure (re)sets most of the above variables # from either the provided files, or the command line goto Setup Help: cat << EOF usage: $0 phased_diff.map Patterson.map oldsites.txt 2.7A 3 sigma P212121 where: phased_diff.map - a phased difference Fourier Patterson.map - a Patterson to check sites against 2.7A - desired resolution cutoff 3 sigma - sigma cutoff for peak-picking P212121 - your space group (if only Patterson.map is given) oldsites.txt - a list of fractional coordinates of "known" sites (can be almost any format, a pdb, mlphare script, or shelx .res file will do) $0 will: Pick peaks > (sigma cutoff) in the phased difference Fourier or just use the provided peaks in "sitefile" Any picked peaks within (1/3 resolution)A of a "known" site (from oldsites.txt) are ignored, as are "map edge sites" and extra symmetry mates. The remaining peaks are then cross-checked with the Patterson map. Here, the "score" is the average of the Patterson map values at every predicted vector (see manual for rsps). In addition, the expected cross-peaks to the "known" sites are scored. If no Patterson is provided, it will be calculated from the phased_diff.map. A sorted list of all these sites is then printed out in: $outfile with the format: frac Angsrm nearest phased map Patterson score x y z X Y Z old peak height/sig harker cross | | | | | | | | | | What you do with this list is up to you. :) EOF exit # # This procedure (at the bottom of the script) does the following # 1) scan the command line for mtz and mlphare script files # 2) set the CELL, SG, and hiRES variables # 3) generate dataset name lists: ${tempfile}Fs and ${tempfile}Ds # 3) generate "known site" lists: ${tempfile}oldsites_* ${tempfile}badsites Return_from_Setup: # make sure we have something to work with if((! -e "$sitefile")&&(! -e "$Fourier_map")) then # no way to get sites echo "ERROR: no sites to check! " echo " please specify either a map or a filename containing" echo " your metal sites." goto Help endif if("$SG" == "") then # must have given a Patterson, but no space group echo "ERROR: we don't have a space group. " echo "" echo "please run:" echo "$0 P212121 $* " echo "or whatever your space group is." echo "" goto Help endif ################################################################ # initial report on intended program flow ################################################################ # start the logfile echo "" >! $logfile if(-e "$Fourier_map") then echo "Looking for sites > ${PICK_cutoff}*sigma in $Fourier_map" | tee -a $logfile if(-e "$sitefile") then set oldsites = `cat ${tempfile}oldsites_reduced | wc -l` set temp = `cat ${tempfile}badsites | wc -l` if($temp != 0) set oldsites = "$oldsites (plus $temp bad)" echo "that are not already within ${CLOSE_peaks}A" | tee -a $logfile echo "of the $oldsites atoms in $sitefile" | tee -a $logfile endif else if(-e "$sitefile") then echo "the $oldsites sites in $sitefile will be checked" | tee -a $logfile if(-e "$Patterson_map") then echo "against $Patterson_map" | tee -a $logfile else # should never get here goto Help endif echo "" | tee -a $logfile # no need to pick peaks goto RSPS endif endif Pick_Peaks: ################################################################ ##### # #### # # # # # # # # # # # # # #### ##### # # # # # # # # # # # # #### # # ################################################################ # pick peaks in the difference Fourier to find new sites ################################################################ # redundant catch if(! -e "$Fourier_map") then # uhh, assume we have some peaks goto RSPS endif echo "" # avoid roundoff errors in height / confusions about sigma # re-scale map so sigma = 1 # and extend it out just a bit (for peak-picking) mapmask MAPIN $Fourier_map \ MAPOUT ${tempfile}norm.map << EOF-extend >> $logfile SCALE SIGMA XYZLIM -0.1 1.1 -0.1 1.1 -0.1 1.1 EOF-extend # pick peaks in the extended, normalized map set temp = `echo "$PICK_cutoff" | nawk '$1+0>=0{print $1} $1+0<0{print -$1, "NEGATIVES"}'` peakmax MAPIN ${tempfile}norm.map \ TO ${tempfile}.xyz PEAKS ${tempfile}.xyz << eof-pick >> $logfile THRESHOLD $temp #NUMPEAKS 20 OUTPUT PEAKS END eof-pick if($status) then set temp = `echo "$PICK_cutoff" | nawk '{printf "%d", $1-1}'` if("$temp" > 0) then echo "reducing pick cutoff to $temp sigma" set PICK_cutoff = "$temp" goto Pick_Peaks endif # otherwise, we're just screwed echo "unable to pick peaks in $Fourier_map" echo "sorry! " goto Finish endif echo -n "analyzing peaks " # condense peak data to essential list # and remove picks outside the cell # (we don't need them, and they could be map-edge picks) cat ${tempfile}.xyz |\ nawk 'NF>6 && /[^1-9.-]/{ \ print substr($0,23,8), substr($0,31,8), substr($0,39,8), substr($0,6,8)}' |\ nawk '$1+0>0 && $1+0<1 && $2+0>0 && $2+0<1 &&$3+0>0 && $3+0<1' |\ sort -nr +3 >! ${tempfile}peaks # # format is xf yf zf height/sigma # # clean up rm -f ${tempfile}.xyz >& /dev/null rm -f ${tempfile}norm.map >& /dev/null ################################################################ # FILTER OUT DUPLICATE SITES ################################################################ # because we picked a whole unit cell above (and then some) # we need to remove symmetry mates from the peak list # first, generate ALL symmetry-equivalent positions for the picked peaks cat << EOF >! ${tempfile}gensym.in SYMM $SG CELL $CELL EOF cat ${tempfile}peaks |\ nawk '{++n; print "RESIDUE",n; print "ATOM X", $1, $2, $3}' |\ cat >> ${tempfile}gensym.in cat ${tempfile}gensym.in | gensym |&\ nawk '/List of sites/,/Normal termination/' |\ nawk '$2 ~ /[01].[0-9][0-9][0-9]/{print $2, $3, $4, $5, $6, $7, $(NF-1)}' |\ cat >! ${tempfile}peaks_symm rm -f ${tempfile}gensym.in >& /dev/null # ${tempfile}peaks_symm is now an indexed list of all # symmetry-related peak positions in the unit cell # record peak heights (since they didn't survive gensym) cat ${tempfile}peaks |\ nawk '{++n; print n, $NF}' |\ cat >! ${tempfile}peak_heights # now filter the symmetry-expanded list for the # unique list of non-symmetry related peaks cat ${tempfile}peak_heights ${tempfile}peaks_symm |\ nawk -v cut=$CLOSE_peaks 'NF==2{height[$1]=$2}\ NF>3{++n; X[n]=$4; Y[n]=$5; Z[n]=$6; group[n]=$NF;\ for(i=1;i! ${tempfile}peaks ################################### # now ${tempfile}peaks is the list of symmetry-unique map peaks # (without any map-edge picks) # in the format xf yf zf X Y Z height/sigma # clean up rm -f ${tempfile}peak_heights >& /dev/null rm -f ${tempfile}peaks_symm >& /dev/null ################################################################ # FILTER OUT UNINTERESTING SITES ################################################################ if(! -e ${tempfile}oldsites_original) touch ${tempfile}oldsites_original if(! -e ${tempfile}badsites) touch ${tempfile}badsites # symmetry-expand the "known" and "known bad" sites cat << EOF >! ${tempfile}gensym.in SYMM $SG CELL $CELL XYZLIM -0.2 1.2 -0.2 1.2 -0.2 1.2 EOF cat ${tempfile}oldsites_original ${tempfile}badsites |\ nawk '{print "ATOM X", $1, $2, $3}' |\ cat >> ${tempfile}gensym.in cat ${tempfile}gensym.in | gensym |&\ nawk '/List of sites/,/Normal termination/' |\ nawk '$2 ~ /[01].[0-9][0-9][0-9]/{print $2, $3, $4, $5, $6, $7, "known"}' |\ cat >! ${tempfile}all_known_sites rm -f ${tempfile}gensym.in >& /dev/null # ${tempfile}all_known_sites now contains ALL symmetry-equivalent # positions to the sites the user entered as "known" or "bad" set newsites = `cat ${tempfile}peaks | wc -l` echo -n "$newsites found > ${PICK_cutoff}*sigma, " # filter out peaks within $CLOSE_peaks Angstroms of old or bad sites # AND add the distance to the nearest cat ${tempfile}all_known_sites ${tempfile}peaks |\ nawk -v cut=$CLOSE_peaks ' \ $NF=="known"{++n; X[n]=$4; Y[n]=$5; Z[n]=$6} \ $NF!="known"{minD=999999; \ # find nearest "old" site \ for(i=1;i<=n;++i){\ dist=sqrt(($4-X[i])^2 +($5-Y[i])^2 +($6-Z[i])^2);\ if(dist < minD){\ minD=dist;}}\ # now see if it is too close \ if(minD > cut) {print}}' |\ sort -nr +6 >! ${tempfile} mv ${tempfile} ${tempfile}peaks # don't need this anymore rm -f ${tempfile}badsites >& /dev/null rm -f ${tempfile}all_known_sites >& /dev/null ################################### # ${tempfile}peaks still has the format: # xf yf zf X Y Z height/sigma # set newsites = `cat ${tempfile}peaks | wc -l` echo "($newsites new)." Patterson: ################################################################ ##### ## ##### ##### ###### ##### #### #### # # # # # # # # # # # # # # ## # # # # # # # ##### # # #### # # # # # ##### ###### # # # ##### # # # # # # # # # # # # # # # # # # # ## # # # # # ###### # # #### #### # # ################################################################ # make Patterson, if we don't have it ################################################################ if(-e "$Patterson_map") then # we don't need to calculate it then goto RSPS endif if(! -e "$Fourier_map") then # should never get here either echo "ERROR: no maps available! " goto Help endif # get maximum possible resolution (to fit into RSPS) set PattRES = `echo $CELL | nawk -v pts=$MAXPAT '{printf "%.2f\n", 3*(($1 * $2 * $3)/pts)^(1/3)}'` set PattRES = `echo "$PattRES $hiRES" | nawk '$1>$2{print $1} $2>$1{print $2}'` # use sfall to back-transform the difference map echo "" echo "making Patterson from $Fourier_map at $PattRES A" # overdrive the sampling a bit set temp = `echo $mapRES | nawk '{print 0.9*$1}'` sfall MAPIN $Fourier_map HKLOUT ${tempfile}reversed.mtz << EOF-sfall >> $logfile SYMM $SG RESOLUTION 1000 $temp MODE SFCALC MAPIN END EOF-sfall if($status) then # hmm, maybe resolution is too high? echo "GOTHERE" exit endif # re-transform as a Patterson Patt_again: # calculate the Patterson map fft HKLIN ${tempfile}reversed.mtz MAPOUT ${tempfile}_Patt.map << EOF-fft >> $logfile RESOLUTION $loRES $PattRES PATTERSON title Patterson version of $Fourier_map LABIN F1=FC END EOF-fft # check to make sure it's not too big set size = `ls -l ${tempfile}_Patt.map | nawk '{printf "%d\n", $5/4}'` if("$size" > $MAXPAT) then # this is not going to work in RSPS # make it smaller # crude cell volume set PattRES = `echo $PattRES | nawk '{print 1.1*$1}'` echo "woops, map too big for rsps, reducing resolution to $PattRES." goto Patt_again endif # clean up rm -f ${tempfile}reversed.mtz >& /dev/null set Patterson_map = ${tempfile}_Patt.map RSPS: ################################################################ ##### #### ##### #### # # # # # # # # #### # # #### ##### # ##### # # # # # # # # # # #### # #### ################################################################ # use RSPS to get a score for every putative site with the Patterson ################################################################ if(! -e ${tempfile}peaks) touch ${tempfile}peaks if(! -e ${tempfile}oldsites_reduced) touch ${tempfile}oldsites_reduced set newsites = `cat ${tempfile}peaks | wc -l` set oldsites = `cat ${tempfile}oldsites_reduced | wc -l` echo "" set type = "other" # decide what kind of site data we have if($oldsites > 0) then if($newsites > 0) then # both old and new sites avaialble, so check the # new sites for crossvectors to the old sites echo "checking $newsites new sites against $oldsites old sites on Patterson" ##### normal behavior ####### set type = " old" else # we have old sites, but no new ones if(-e "$Fourier_map") then # we DID, however, pick peaks # they must have all been filtered out echo "no sites to check against Patterson" goto Finish endif # otherwise, score the provided "old" sites against themselves in # the crossvector search echo "checking $oldsites sites against the Patterson" # convert "old site" file to a peak-pick file cat ${tempfile}oldsites_reduced |\ nawk '{print $1, $2, $3, $4, $5, $6, "?.??"}' |\ cat >! ${tempfile}peaks endif else # $oldsites == 0 if($newsites > 0) then # we have new sites, but no old ones # set up "new" as "old" so we can report some kind of # cross-vector score echo "checking $newsites new peaks against the Patterson" # convert picked peaks to an "old site" file cat ${tempfile}peaks |\ nawk '{++n; print $1, $2, $3, $4, $5, $6, "?", "site"}' |\ cat >! ${tempfile}oldsites_reduced # backtrack to "original" file too cat ${tempfile}oldsites_reduced |\ nawk '{print $1, $2, $3}' |\ cat >! ${tempfile}oldsites_original else # no sites at all! echo "no sites to check against the Patterson" goto Finish endif endif # now, for each "new" site calculate the distance to # the nearest "other" site # new-old if we have both # new-new if we had no old sites # old-old if no new peaks were found # symmetry-expand the "old" sites cat << EOF >! ${tempfile}gensym.in SYMM $SG CELL $CELL XYZLIM -0.2 1.2 -0.2 1.2 -0.2 1.2 EOF cat ${tempfile}oldsites_original |\ nawk '{print "RESIDUE", NR; print "ATOM X", $1, $2, $3}' |\ cat >> ${tempfile}gensym.in cat ${tempfile}gensym.in | gensym |&\ nawk '/List of sites/,/Normal termination/' |\ nawk '$2 ~ /[01].[0-9][0-9][0-9]/{print $2, $3, $4, $5, $6, $7, $(NF-1), "sym"}' |\ cat >! ${tempfile}oldsites_sym rm -f ${tempfile}gensym.in >& /dev/null # label "old" sites that are really "new" if($oldsites == 0) then # "old" sites are really "new" sites, indicate this in the file cat ${tempfile}oldsites_sym |\ nawk '{$7= $7 "?"; print}' |\ cat >! ${tempfile} mv ${tempfile} ${tempfile}oldsites_sym endif # measure the distances cat ${tempfile}oldsites_sym ${tempfile}peaks |\ nawk '# record "old" site positions \ $NF=="sym"{++n; X[n]=$4; Y[n]=$5; Z[n]=$6; site[n]=$7} \ $NF!="sym"{++j; minD=999999; for(i=1;i<=n;++i){ \ # find nearest "old" site \ dist=sqrt(($4-X[i])^2 +($5-Y[i])^2 +($6-Z[i])^2);\ # only consider distances not flagged as same site \ if(site[i]+0!=j || (site[i] !~ /[?]/ && $0 !~ /[?]/)) \ if(dist < minD) minD=dist}\ # tack this distance to the end of the line \ print $0, minD}' |\ cat >! ${tempfile} mv ${tempfile} ${tempfile}peaks rm -f ${tempfile}oldsites_sym >& /dev/null # ${tempfile}peaks now has the format: # xf yf zf X Y Z height/sigma dist harker: ################################################################ # check sites against harker sections ################################################################ cat << EOF >! ${tempfile}rspsin spacegroup $SG patfile $Patterson_map scorfile ${tempfile}rsps.map reject 0 mode harker scan au EOF # ask for harker score of each peak cat ${tempfile}peaks |\ nawk '{print "vlist", $1, $2, $3}' |\ cat >> ${tempfile}rspsin # get harker scores cat ${tempfile}rspsin | rsps | tee -a $logfile |\ nawk '/Rms deviation from mean/{rms=$NF; if(rms<=0) rms=1}\ $1=="SCORE"{++n; print n, $3, $3/rms}' |\ cat >! ${tempfile}harker_score # check that it worked set temp = `cat ${tempfile}harker_score | wc -l` if($temp == 0) then echo "RSPS failed! " echo "sorry! look in $logfile to see what went wrong." goto Finish endif ################################################################ # filter "fixxyz" sites before doing cross score? ################################################################ # get cross-vector score to known peaks (if any) # but first, check that RSPS can handle the load set oldsites = `cat ${tempfile}oldsites_reduced | wc -l` if($oldsites > 30) then # we have a problem: RSPS can't hold more than 30 fixxyz sites echo "WARNING: RSPS can't handle $oldsites reference sites" echo " cross-validation will be limited to 30 $sitefile sites" echo " which map to the heightst harker peaks." cat << EOF >! ${tempfile}rspsin spacegroup $SG patfile $Patterson_map scorfile ${tempfile}rsps.map reject 0 mode harker EOF # ask for harker score of "old" sites cat ${tempfile}oldsites_reduced |\ nawk '{print "vlist", $1, $2, $3}' |\ cat >> ${tempfile}rspsin # run RSPS cat ${tempfile}rspsin | rsps |& tee -a $logfile |\ nawk '$1=="SCORE"{++n; print n, $3}' |\ sort -nr +1 | head -30 >! ${tempfile}pattscore # screen these top peaks out of the original sites cat ${tempfile}pattscore ${tempfile}oldsites_reduced |\ nawk 'NF==2{hscore[$1]=$2} \ NF>6{++n; if(hscore[n]) print "fixxyz", $1, $2, $3}' |\ cat >! ${tempfile}fixxyz else # okay to use all old sites cat ${tempfile}oldsites_reduced |\ nawk '{print "fixxyz", $1, $2, $3}' |\ cat >! ${tempfile}fixxyz endif ################################################################ # check sites against cross peaks to known sites ################################################################ # now make RSPS input for cross-vector search cat << EOF >! ${tempfile}rspsin spacegroup $SG patfile $Patterson_map scorfile ${tempfile}rsps.map reject 0 mode cross EOF # retrieve "fixed" known sites (even if they are also the querry sites? ) cat ${tempfile}fixxyz >> ${tempfile}rspsin rm -f ${tempfile}fixxyz >& /dev/null # ask for cross-vector score to old peaks echo "scan au" >> ${tempfile}rspsin cat ${tempfile}peaks |\ nawk '{print "vlist", $1, $2, $3}' |\ cat >> ${tempfile}rspsin # get crosspeak scores cat ${tempfile}rspsin | rsps |& tee -a $logfile |\ nawk '/Rms deviation from mean/{rms=$NF; if(rms<=0) rms=1}\ $1=="SCORE"{++n; print n, $3, $3/rms}' |\ cat >! ${tempfile}cross_score # clean up rm -f ${tempfile}rspsin >& /dev/null rm -f ${tempfile}rsps.map >& /dev/null # only if user didn't want it rm -f ${tempfile}bestPatterson.map >& /dev/null # assign Patterson score to data in files cat ${tempfile}harker_score |\ nawk '{score[$1] += $3} \ END{for(i in score) print i, score[i], "harker"}' |\ sort -n >! ${tempfile}pattscore cat ${tempfile}cross_score |\ nawk '{score[$1] += $3} \ END{for(i in score) print i, score[i], "cross"}' |\ sort -n >> ${tempfile}pattscore Finish: ################################################# ###### # # # # #### # # # # ## # # # # # ##### # # # # # #### ###### # # # # # # # # # # # # ## # # # # # # # # # # #### # # ################################################# # # put all these scores into the same file # ################################################# if(! -e "${tempfile}pattscore") touch ${tempfile}pattscore if(! -e "${tempfile}peaks") touch ${tempfile}peaks # put all scores on the same line cat ${tempfile}pattscore ${tempfile}peaks |\ nawk '$NF=="harker"{hscore[$1]=$2} \ $NF=="cross" {cscore[$1]=$2} \ NF>6{++n; print $0, hscore[n], cscore[n]}' |\ cat >! ${tempfile}scores echo "" >! $outfile echo "fractional coordinates Angstrom coordinates nearest height/sig Patterson map score " >> $outfile echo " x y z X Y Z $type site on Fourier harker cross " >> $outfile cat ${tempfile}scores |\ nawk '{printf "%8.5f %8.5f %8.5f %8.3f %8.3f %8.3f %8.2fA %14s %14.5f %14.5f\n",\ $1, $2, $3, $4, $5, $6, $8, $7, $9, $10}' |\ sort +7nr -8 +8nr -9 +9nr -10 >> $outfile set newsites = `cat ${tempfile}scores | wc -l` if($newsites > 0) then echo "" echo "all peaks and scores written to $outfile" endif # now figure out what to do with it! # clean up Clean_up_sites: rm -f ${tempfile}harker_score >& /dev/null rm -f ${tempfile}cross_score >& /dev/null rm -f ${tempfile}pattscore >& /dev/null rm -f ${tempfile}oldsites_original >& /dev/null rm -f ${tempfile}oldsites_reduced >& /dev/null rm -f ${tempfile}badsites >& /dev/null rm -f ${tempfile}scores >& /dev/null rm -f ${tempfile}peaks >& /dev/null rm -f ${tempfile}_Patt.map >& /dev/null exit Setup: ################################################# #### ###### ##### # # ##### # # # # # # # #### ##### # # # # # # # # # # ##### # # # # # # # #### ###### # #### # ################################################# # # gather information on: # map files # known sites # "bad" sites # resolution limit # ################################################## goto Unwrap_Awk_Scripts # deploy ${tempfile}sitereader.awk Return_Unwrap_Awk_Scripts: set mapfiles = "" # scan the command line for files echo -n "" >! ${tempfile}oldsites_original echo -n "" >! ${tempfile}oldsites_reduced echo -n "" >! ${tempfile}badsites echo -n "" >! ${tempfile}peaks foreach arg ( $* ) # warn about probable mispellings if((! -e "$arg")&&(($arg =~ *.pdb)||($arg =~ *.map))) then echo "WARNING: $arg does not exist" endif if(-e "$arg") then if("$arg" =~ *.pdb) then set pdbfile = "$arg" continue endif if("$arg" =~ *.map) then set mapfiles = "$mapfiles $arg" continue endif # some kind of file, look for fractional coordinates # set temp = `file $arg | nawk '{print $NF}'` # if("$temp" == "text") then set temp = `ls -lLd $arg |& nawk '/^-/{print $5+0}' | head -1` if(("$temp" != "")&&("$temp" != "0")) then if($temp > 10000) then # this is too big continue endif # general XYZ coordinate extraction cat $arg |\ nawk '{for(i=1;i<=NF;++i){\ if((($i ~ /^[01].[0-9][0-9][0-9]/)||\ ($i ~ /^-[01].[0-9][0-9][0-9]/))&&\ ($i+0>=-1.1)&&($i+0<=1.1))\ {++coords}else{coords=0};\ if(coords==3)\ {print $(i-2), $(i-1), $i; coords=0; i=NF}}}' |\ cat >! ${tempfile} set temp = `cat ${tempfile} | wc -l` if($temp > 0) then # might as well use it cp ${tempfile} ${tempfile}oldsites_original >& /dev/null set sitefile = "$arg" endif rm -f ${tempfile} >& /dev/null else echo "${arg} is a "\"$temp\"" file. We need text." endif endif end # user specified a map, Patterson or Fourier? foreach map ( $mapfiles ) # have a look at the map header echo "" |\ mapdump MAPIN $map |\ cat >! ${tempfile}mapheader set mapSG = `nawk '$1=="Space-group"{print $NF}' ${tempfile}mapheader` if("$mapSG" == "") then echo "ERROR: $map is unreadable! Ignoring $map" continue endif set mapSG = `nawk -v SG="$mapSG" '$1==SG{print $4}' $CLIBD/symop.lib` if("$mapSG" == "") then echo "ERROR: $map is in an unknown space group! Ignoring $map" continue endif # set variables from this map set CELL = `nawk '/Cell dimensions/{print $4, $5, $6, $7, $8, $9}' ${tempfile}mapheader` set hiRES = `nawk '/Grid sampling/{grid=$8} /Cell dimensions/ && grid+0>0{print 3*$4/grid}' ${tempfile}mapheader` if(("$mapSG" =~ *[abcdmn]*)||("$mapSG" =~ *-*)) then # this must be a Patterson set Patterson_map = "$map" set userPatt = "$map" # guess the original space group? continue else # must be a real-space map (from same structure?) set Fourier_map = "$map" set userFourier = "$map" set SG = "$mapSG" set mapRES = "$hiRES" continue endif # got here? how? echo "ERROR: unable to use $map" echo " couldn't figure out what kind of map it is." end rm -f ${tempfile}mapheader >& /dev/null # now all filenames have been initialized if((! -e "$Fourier_map")&&(! -e "$Patterson_map")) then # cannot proceed with NO maps # it's not our business to calculate them from mtzs echo "ERROR: no maps specified! " echo " nothing for us to do! " goto Help endif # space group from pdb? if($?pdbfile) then # atoms were entered as a PDB set sitefile = $pdbfile nawk '/^CRYST/{print substr($0, 55, 12); exit}' $pdbfile |\ nawk '{print $1, $2, $3, $4}' |\ nawk '/^[PCIFHR]/{printf "%s", $1; for(i=2;i<=NF;++i){\ if((($i !~ /^[1-6]$/)&&($i !~ /^[1-6][1-5]$/))||\ (substr($i,1,1)+0 < substr($i,2,1)+0)){exit}; printf $i}} \ END{print ""}' |\ cat >! ${tempfile} set pdbSG = `cat ${tempfile}` rm -f ${tempfile} >& /dev/null if("$pdbSG" != "") then # got something, check it: set temp = `nawk -v SG="$pdbSG" '$4==SG{print $4}' $CLIBD/symop.lib` if("$temp" != "") then # use it set SG = "$pdbSG" else echo "WARNING: $pdbSG from $pdbfile is not a real space group." endif endif # may as well check the cells echo "MAP $CELL" |\ nawk '{printf "MAP %8.3f %8.3f %6.1f %6.1f %6.1f %6.1f\n", $2, $3, $4, $5, $6, $7}' |\ cat >! ${tempfile}cell nawk '/^CRYST/' $pdbfile |\ nawk '{printf "PDB %8.3f %8.3f %6.1f %6.1f %6.1f %6.1f\n", $2, $3, $4, $5, $6, $7}' |\ cat >> ${tempfile}cell cat ${tempfile}cell |\ nawk 'NR==1{for(i=2;i<=NF;++i){c[i]=$i;}}\ NR==2{for(i=2;i<=NF;++i){d=sqrt(($i-c[i])^2); if(maxd! ${tempfile} set temp = `head -1 ${tempfile}` if("$temp" == "") set temp = 0 rm -f ${tempfile} if($temp > 3) then echo "WARNING: unit cells" cat ${tempfile}cell echo "are kind of different! " endif rm -f ${tempfile}cell endif # grab other things from the mlphare script? if(-e "$sitefile") then # get resolution set temp = `nawk '/^RESO/ && $NF+0 >0.1 && $NF+0 < 6 {print $NF}' $sitefile` if("$temp" != "") then set hiRES = "$temp" endif # look for space group cat $sitefile |\ nawk '{for(i=1;i<=length($0);++i){c=substr($0,i,1); \ if(c !~ /[PpCcIiFfRr1-6]/){print ""}else{printf "%s", c}}print ""}' |\ nawk '/^[PpCcIiFfRr][1-6]/ && substr($1,2) !~ /[A-Z]/' |\ cat >! ${tempfile} set scriptSG = `cat ${tempfile}` rm -f ${tempfile} >& /dev/null if("$scriptSG" != "") then # got something, check it: set temp = `nawk -v SG="$scriptSG" '$4==SG{print $4}' $CLIBD/symop.lib` if("$temp" != "") then # use it set SG = "$scriptSG" else echo "WARNING: $scriptSG from $sitefile is not a real space group." endif endif endif # grab something from shelx file? # one last pass through command line # allow user overrides of all internal variables set i = 0 while( $i < $#argv ) @ i = ( $i + 1 ) @ nexti = ( $i + 1 ) @ lasti = ( $i - 1 ) if($nexti > $#argv) set nexti = $#argv if($lasti < 1) set lasti = 1 set arg = "$argv[$i]" if(! -e "$arg") then # space group if("$arg" =~ [PpCcIiFfRr][1-6]*) then set temp = `echo $arg | nawk '{print toupper($1)}'` if($?CLIBD) then set temp = `nawk -v SG=$temp '$4 == SG {print $4}' $CLIBD/symop.lib | head -1` endif if("$temp" != "") then # add this SG to the space group list set SG = "$temp" endif endif # see if a dataset label was given if("$arg" =~ [0-9]*) then # we have a number if(("$arg" =~ *A)||("$argv[$nexti]" == "A")) then # user-preferred resolution limits set temp = `echo "$arg" | nawk 'BEGIN{FS="-"} $1+0 > 0.1{print $1+0} $2+0 > 0.1{print $2+0}'` if($#temp != 1) then set temp = `echo $temp | nawk '$1>$2{print $1, $2} $2>$1{print $2, $1}'` if($#temp == 2) then set loRES = "$temp[1]" set hiRES = "$temp[2]" endif else if("$temp" != "") set hiRES = "$temp" endif endif if(("$arg" =~ *[Ss]igma)||("$argv[$nexti]" =~ [Ss]igma)) then set PICK_cutoff = `echo "$arg" | nawk '$1+0 > 0{print $1+0}'` endif if(("$arg" =~ *[Dd]iff)||("$argv[$lasti]" =~ [Dd]iff)) then set MAX_diso = `echo "$arg" | nawk '$1+0 > 0{print $1+0}'` set MAX_dano = `echo "$arg" | nawk '$1+0 > 0{print $1+0}'` endif if(("$arg" =~ *[Dd]iso)||("$argv[$lasti]" =~ [Dd]iso)) then set MAX_diso = `echo "$arg" | nawk '$1+0 > 0{print $1+0}'` endif if(("$arg" =~ *[Dd]ano)||("$argv[$lasti]" =~ [Dd]ano)) then set MAX_dano = `echo "$arg" | nawk '$1+0 > 0{print $1+0}'` endif endif # allow "NO" logic to carry through unset NO if(("$arg" == "no")||("$arg" == "not")) set NO if(("$arg" == "don't")||("$arg" == "ignore")) set NO if("$arg" == "except") set NO endif end # calculate expected spacial discrimination limit as 1/3 resolution limit if("$CLOSE_peaks" == "") then set CLOSE_peaks = `echo "$hiRES" | nawk '{printf "%.2f", $1/3}'` endif ########################################################## # now establish locations of "known" atoms echo -n "" >! ${tempfile}badsites if(-e "$sitefile") then # make sure we handle mlphare scripts properly grep mlphare $sitefile >& /dev/null if(! $status) then # only "ATOM" cards should be used nawk '$1~/^ATOM/{print $3, $4, $5}' $sitefile >! ${tempfile} set temp = `cat ${tempfile} | wc -l` if("$temp" != 0) then # THESE are the atoms we want cross-peaks to mv ${tempfile} ${tempfile}oldsites_original >& /dev/null endif # Phaser's "BADATOM" cards at bottom, unexecuted end of the script cat $sitefile |\ nawk '$1~/^BADATOM/{print $3, $4, $5}' |\ cat >! ${tempfile}badsites # ${tempfile}badsites may or may not have anything in it endif # sites were already extracted into ${tempfile}oldsites_original # during command-line scan (or directly above) endif if($?pdbfile) then # use coordconv to get fractional coordinates of these atoms coordconv XYZIN $pdbfile XYZOUT ${tempfile}oldsites.xyz << EOF-conv > /dev/null INPUT PDB CELL $CELL OUTPUT FRAC END EOF-conv nawk '{print $2, $3, $4}' ${tempfile}oldsites.xyz |\ cat >! ${tempfile}oldsites_original rm -f ${tempfile}oldsites.xyz >& /dev/null endif # at this point, whatever sites are avaiable have been # read into ${tempfile}oldsites_original as fractional x y z set inputsites = `cat ${tempfile}oldsites_original | wc -l` # generate ALL symmetry-equivalent positions for ALL input sites cat << EOF >! ${tempfile}gensym.in SYMM $SG CELL $CELL XYZLIM -0.1 1.1 -0.1 1.1 -0.1 1.1 EOF cat ${tempfile}oldsites_original |\ nawk '{++n; print "RESIDUE",n; print "ATOM X", $0}' |\ cat >> ${tempfile}gensym.in cat ${tempfile}gensym.in | gensym |&\ nawk '/List of sites/,/Normal termination/' |\ nawk '$2 ~ /[01].[0-9][0-9][0-9]/{print $2, $3, $4, $5, $6, $7, $(NF-1), "old"}' |\ cat >! ${tempfile}oldsites_allsym rm -f ${tempfile}gensym.in >& /dev/null # now filter the symmetry-expanded list for the # unique list of independent sites cat ${tempfile}oldsites_allsym |\ nawk -v cut=$CLOSE_peaks '\ $NF=="old"{++n; X[n]=$4; Y[n]=$5; Z[n]=$6; group[n]=$(NF-1); \ for(i=1;i! ${tempfile}oldsites_reduced # we can always regenerate the symm sites rm -f ${tempfile}oldsites_allsym # remember number of unique sites set oldsites = `cat ${tempfile}oldsites_reduced | wc -l` # ${tempfile}oldsites_reduced now contains the XYZ coordinates of # each unique atom site in the $sitefile (or $pdbfile) # in fractional and Angstrom units # labelled as: xf yf zf X Y Z site# "site" # ${tempfile}oldsites_original # will later be combined with bad sites to filter out # "uninteresting" sites from the peak-pick # formatted as: xf yf zf # also, ${tempfile}badsites contains a list of "bad" sites # (as) written by Phaser Elves in their mlphare scripts. # don't need this anymore rm -f ${tempfile}sitereader.awk >& /dev/null goto Return_from_Setup Unwrap_Awk_Scripts: cat << EOF-sitereader >! ${tempfile}sitereader.awk #! $nawk -f # # # Retrieve arbitrarily formatted heavy-metal sites # # looks for contiguous blocks of lines # all of them containing three consecutive numbers written # to 3 or more decimal places and betwen -1.1 and +1.1 # BEGIN{ # estimate of B, to aid in finding real B-factors if(! B) B = 50 if(! wilsonB) wilsonB = B if(! expectB) expectB = wilsonB if(! OCC) OCC = 1 if(! expectOCC) expectOCC = OCC } # line must have at least 3 words NF>=3{ coords=0; for(i=1;i<=NF;++i) { # pattern to recognize fractional coordinates if(((\$i ~ /^[01].[0-9][0-9][0-9]/)||(\$i ~ /^-[01].[0-9][0-9][0-9]/))&& (\$i+0>=-1.1)&&(\$i+0<=1.1)) { ++coords } else { coords=0 } # check for 3 consecutive hits \\ if(coords==3) { # this line is a site ++site; ++sites; XYZ[site] = "" for(i=i-2;i<=NF;++i) { # remember this line if(\$i !~ /[^0-9.-]/) { XYZ[site] = XYZ[site] " " \$i; } } # reset site list with each new block if(sites>site) sites = site; } } if(coords != 3) { # this was not a site, so reset the site counter for the # next block of sites site = 0; } } END { # bail if we found no sites if(! sites) exit # see if we can assign the rest of the numbers maxnums = 10 for(site=1;site<=sites;++site) { nums=split(XYZ[site], num) # remember length of shortest list if(numsmean[max_value]) max_value = i; } # biggest B-factor dominates if(sqrt((mean[max_value]-expectB)^2)/expectB < 0.5) { BFAC_col=max_value; } # start on right side searching for B factor for(i=maxnums;i>3;--i) { # check each column against the expected B value if(sqrt((mean[i]-expectB)^2)/expectB < 0.5) { # this column is within 50% of expected B factor if(! BFAC_col) BFAC_col=i; } } # start at 4th position for occupancy for(i=4;i<=maxnums;++i) { # real occupancy could be positive or negative if((sqrt((sqrt(mean[i]^2)-expectOCC)^2)/expectOCC < 0.8)&&(i != BFAC_col)) { # this column is within 50% of expected occupancy value # (ignoring sign) if(! OCC_col) OCC_col=i; } # anomalous occupancy is always positive, and usually after OCC if((sqrt((mean[i]-sqrt(expectOCC^2))^2)/expectOCC < 0.8) && (i != OCC_col) && (i != BFAC_col)) { # this column is within 50% of expected occupancy value if(! AOCC_col) AOCC_col=i; } } # now print out the last, contiguous block of sites for(site=1;site<=sites;++site) { # preliminary assignment of variables nums=split(XYZ[site], num) X = num[1]; Y = num[2]; Z = num[3]; OCC ="?.???"; AOCC="?.???"; BFAC="?.???"; # assign values (if they are known) if( OCC_col) OCC = sprintf("%6.3f", num[OCC_col]); if(AOCC_col) AOCC = sprintf("%6.3f", num[AOCC_col]); if(BFAC_col) BFAC = sprintf("%8.3f", num[BFAC_col]); if((AOCC=="?.???")&&(OCC!="?.???")) { # might as well... AOCC = sprintf("%6.3f", sqrt(OCC^2)); } # now print out the site printf " ATOM%-3d ANO %6.5f %6.5f %6.5f %6s %6s BFAC %8s\\n", site, X, Y, Z, OCC, AOCC, BFAC; } } EOF-sitereader chmod a+x ${tempfile}sitereader.awk goto Return_Unwrap_Awk_Scripts #################################################### The Future: compare two sets of sites for Patterson-equivalence new dataset weighting scheme?