#! /bin/csh -f # # pick.com - James Holton 2-19-11 # # Pick unique peaks in a map, # avoiding "map-edge" false peaks # optionally avoiding a list of "boring" positions # in a "symmetry-aware" fashion # # defaults set mapfile = "maps/FH_Four.map" set pdbfile = "" set logfile = "pick_details.log" set outfile = "pick.pdb" set sigma = 6 set tempfile = pick_temp # set this to wherever your awk program is alias nawk /usr/bin/nawk nawk 'BEGIN{print}' >& /dev/null if($status) alias nawk awk if("$1" == "") goto Help echo -n "" >! $logfile ################################################################################ goto Setup # set/reset $mapfile $pdbfile $sigma from command-line Help: cat << EOF usage: $0 $mapfile [$sigma] [boring.pdb] where: $mapfile is the map you want to pick $sigma (optional) is the minimum peak height (sigma units) boring.pdb (optional) sites to avoid in peak-picking EOF exit 9 Return_from_Setup: ################################################################################ if(! -e "$mapfile") goto Help set sign = `echo "$sigma" | nawk '$1+0<0{print "+/-" 0-$1} $1+0>0{print $1}'` set print_sigma = "${sign}*sigma peaks" if($?top_only) set print_sigma = "highest peak" echo -n "looking for $print_sigma in $mapfile " if(-e "$sitefile") then echo "not already withing ${CLOSE_peaks}A" echo -n "of the $boring_sites atoms listed in $sitefile" endif echo "" # extract a single ASU from the input map mapmask mapin $mapfile mapout ${tempfile}asu.map << EOF-xtend | tee ${tempfile}xtend >> $logfile #scale sigma xyzlim ASU # re-axis to X,Y,Z AXIS X Y Z # fill blank spaces with zero pad 0 EOF-xtend # make sigma=1 and mean=0 echo "scale sigma" | mapmask mapin ${tempfile}asu.map mapout ${tempfile}sigma.map >> $logfile mv ${tempfile}sigma.map ${tempfile}asu.map # get size of the ASU cat ${tempfile}xtend |\ nawk '/Grid sampling on x, y, z/{print $(NF-2), $(NF-1), $NF;} \ /Start and stop points on x, y, z/{print $(NF-5), $(NF-4), $(NF-3), $(NF-2), $(NF-1), $NF}' |\ nawk 'NF==3{gx=$1;gy=$2;gz=$3}\ NF==6{print "ASU", $1/gx, $2/gx, $3/gy, $4/gy, $5/gz, $6/gz}' |\ cat >! ${tempfile}asu rm -f ${tempfile}xtend >& /dev/null # calculate a 10% "edge pad" set xyzlim = `nawk '/^ASU/{print $2-0.1, $3+0.1, $4-0.1, $5+0.1, $6-0.1, $7+0.1}' ${tempfile}asu` set asu = `nawk '/^ASU/{print $2, $3, $4, $5, $6, $7}' ${tempfile}asu` # re-extend the map by a 10% pad in every direction echo "xyzlim $xyzlim" |\ mapmask mapin ${tempfile}asu.map mapout ${tempfile}pick.map >> $logfile rm -f ${tempfile}asu.map >& /dev/null repeat: # reformat to peakmax vernacular set sigma = `echo $sigma | awk '$1+0>0{print $1} $1+0<0{print -$1,"NEGATIVES"}'` # do the actual peak-pick peakmax MAPIN ${tempfile}pick.map PEAKS ${tempfile}.xyz TO ${tempfile}.xyz << eof-pick >> $logfile THRESHOLD $sigma OUTPUT PEAKS END eof-pick if($status) then grep "Threshold too high" $logfile >& /dev/null if(! $status) then set sigma = `echo "$sigma" | nawk '$1+0>0.5{print $1*2/3}'` if("$sigma" == "") then echo "no peaks." set BAD goto cleanup endif echo "reducing sigma to $sigma" goto repeat endif endif rm -f ${tempfile}pick.map >& /dev/null # re-format the peaks list (with no stuck-together numbers) cat ${tempfile}.xyz |\ nawk 'NF>6 && /[^1-9.-]/{ \ print substr($0,23,8), substr($0,31,8), substr($0,39,8),\ substr($0,49,8), substr($0,57,8), substr($0,65,8), substr($0,6,8)}' |\ cat >! ${tempfile}peaks.pick rm -f ${tempfile}.xyz >& /dev/null # now filter out out-of-bounds (map-edge) peaks ################################################################ # trim off peaks outside the CCP4 ASU limits # (they should either be redundant, or map-edge peaks) cat ${tempfile}asu ${tempfile}peaks.pick |\ nawk '/^ASU/{xmin=$2-0;ymin=$4-0;zmin=$6-0;\ xmax=$3+0;ymax=$5+0;zmax=$7+0;next;} \ {x=$1+0;y=$2+0;z=$3+0}\ x>=xmin && x<=xmax && y>=ymin && y<=ymax && z>=zmin && z<=zmax {print}' |\ sort -nr --key=7 >! ${tempfile}peaks.trimmed # add another 5% pad (just in case we lost a few) cat ${tempfile}asu ${tempfile}peaks.pick |\ nawk -v pad=0.05 '/^ASU/{xmino=$2-pad;ymino=$4-pad;zmino=$6-pad;\ xmaxo=$3+pad;ymaxo=$5+pad;zmaxo=$7+pad;\ xmini=$2-0;ymini=$4-0;zmini=$6-0;\ xmaxi=$3+0;ymaxi=$5+0;zmaxi=$7+0;\ next;} \ {x=$1+0;y=$2+0;z=$3+0}\ x>=xmini && x<=xmaxi && y>=ymini && y<=ymaxi && z>=zmini && z<=zmaxi {next}\ x>xmino && xymino && yzmino && z> ${tempfile}peaks.trimmed rm -f ${tempfile}peaks.pick ${tempfile}asu >& /dev/null # these peaks are sorted by "priority" of their ASU convention # ${tempfile}peaks.trimmed had # format: xf yf zy X Y Z height if($?top_only) then head -n 1 ${tempfile}peaks.trimmed >! ${tempfile} mv ${tempfile} ${tempfile}peaks.trimmed endif ################################################################ # near-edge peaks have probably been counted twice, so we need # to filter them out # generate ALL symmetry-equivalent positions for the trimmed peaks cat << EOF >! ${tempfile}gensym.in SYMM $SGnum CELL $CELL XYZLIM $xyzlim EOF cat ${tempfile}peaks.trimmed |\ nawk '{++n; print "RESIDUE",n; print "ATOM X", $1, $2, $3}' |\ cat >> ${tempfile}gensym.in cat ${tempfile}gensym.in | gensym |&\ nawk '/List of sites/,/Normal termination/' |\ nawk '$2 ~ /[01].[0-9][0-9][0-9]/{print $2, $3, $4, $5, $6, $7, $(NF-1), "sym"}' |\ cat >! ${tempfile}peaks.symm rm -f ${tempfile}gensym.in >& /dev/null # ${tempfile}peaks.symm is now an indexed list of all # symmetry-related peak positions within $xyzlim #format: xf yf zf X Y Z peak# "sym" # to preserve the ASU coordinates: # sort the trimmed coordinates into the list so # they will be the "first" symmetry mate considered # for each "site" cat ${tempfile}peaks.trimmed |\ nawk '{++n; print $1, $2, $3, $4, $5, $6, n, $NF*$NF, $NF}' |\ cat - ${tempfile}peaks.symm |\ sort --key=7n --key=8nr >! ${tempfile}peaks.xpanded # sort +6n -7 +7nr -8 rm -f ${tempfile}peaks.trimmed >& /dev/null rm -f ${tempfile}peaks.symm >& /dev/null # now filter the symmetry-expanded list for the # unique list of non-symmetry related peaks cat ${tempfile}peaks.xpanded |\ nawk '! seen[$1 " " $2 " " $3] {print} {seen[$1 " " $2 " " $3]=1}' |\ nawk -v cut=$CLOSE_peaks '$NF!="sym"{height[$7]=$NF}\ NF>3{++n; X[n]=$4; Y[n]=$5; Z[n]=$6; site[n]=$7;\ # compare this peak to all sites seen so far \ for(i=1;i! ${tempfile}peaks.reduced rm -f ${tempfile}peaks.xpanded >& /dev/null # ${tempfile}peaks.reduced should now contain only unique peaks from $mapin # format: xf yf zf X Y Z height ################################################################ # now look for special positions cat ${tempfile}peaks.reduced |\ nawk 'NF>0{++n; print "RESIDUE",n; print "ATOM X", $1, $2, $3}' |\ cat >! ${tempfile}gensym.in gensym << EOF >! ${tempfile}.log SYMM $SGnum CELL $CELL XYZLIM 0 0.999999 0 0.999999 0 0.999999 @${tempfile}gensym.in EOF # count the number of times each site is "seen" # more than once implies a special position cat ${tempfile}.log |\ nawk '/List of sites/,/Normal termination/' |\ nawk '$2 ~ /[01].[0-9][0-9][0-9]/{print $2, $3, $4, $(NF-1), $NF}' |\ nawk '{++seen[$1 " " $2 " " $3 " " $4]}\ END{for(site in seen) print site, seen[site]}' |\ sort -un --key=4 |\ nawk '$5+0>0{print $4, 1/$5}' >! ${tempfile}occs # sort -un +3 rm -f ${tempfile}.log >& /dev/null # now add these "occupancies" to the master list cat ${tempfile}occs ${tempfile}peaks.reduced |\ nawk 'NF==2{occ[$1]=$2} \ NF>2{++n;print $1,$2,$3,$4,$5,$6,occ[n],$7}' |\ cat >! ${tempfile}peaks.final rm -f ${tempfile}peaks.reduced >& /dev/null # ${tempfile}peaks.final now contains the "final" list of output peaks # and should faithfully represent the unique peaks in the map # format: xf yf zf X Y Z 1/mult height set peaks = `cat ${tempfile}peaks.final | wc -l` echo -n "$peaks found " rm -f ${tempfile}occs >& /dev/null ################################################################ # filter out "boring" sites if(! -e ${tempfile}boring_sites) touch ${tempfile}boring_sites # format: xf yf zf # count number of "boring" sites set boring_sites = `cat ${tempfile}boring_sites | wc -l` # symmetry-expand the "boring" sites cat << EOF >! ${tempfile}gensym.in SYMM $SGnum CELL $CELL XYZLIM $xyzlim EOF cat ${tempfile}boring_sites |\ nawk 'NF>2{print "ATOM X", $1, $2, $3}' |\ cat >> ${tempfile}gensym.in cat ${tempfile}gensym.in | gensym |&\ nawk '/List of sites/,/Normal termination/' |\ nawk '$2 ~ /[01].[0-9][0-9][0-9]/{print $2, $3, $4, $5, $6, $7, "boring"}' |\ cat >! ${tempfile}boring_sites.symm rm -f ${tempfile}gensym.in >& /dev/null # ${tempfile}all_boring_sites now contains ALL symmetry-equivalent # positions to the sites the user entered on the command-line # format: xf yf zf X Y Z "boring" # now remove peaks that were too close to "boring" sites cat ${tempfile}boring_sites.symm ${tempfile}peaks.final |\ nawk -v cut=$CLOSE_peaks ' \ $NF=="boring"{++n; X[n]=$4; Y[n]=$5; Z[n]=$6} \ $NF!="boring"{minD=999999; \ # find nearest "boring" site \ for(i=1;i<=n;++i){\ dist=sqrt(($4-X[i])^2 +($5-Y[i])^2 +($6-Z[i])^2);\ if(dist < minD){\ minD=dist;}}\ # now see if it is too close \ if(minD > cut) {print}}' |\ cat >! ${tempfile}peaks.interesting rm -f ${tempfile}boring_sites.symm >& /dev/null rm -f ${tempfile}peaks.final >& /dev/null # sort the picked peaks by height sort -nr -k8 ${tempfile}peaks.interesting >! ${tempfile} mv ${tempfile} ${tempfile}peaks.interesting # ${tempfile}peaks.interesting now contains only "interesting" peaks # format: xf yf zf X Y Z 1/mult height # see how many are left set interesting = `cat ${tempfile}peaks.interesting | wc -l` if($boring_sites) echo -n "($interesting) new" echo "" ################################################################ # make a pdb output file echo "REMARK B-factors are peak heights in $mapfile" >! $outfile echo "REMARK Occs are 1/multiplicity (for special positions)" >> $outfile echo "$CELL" | nawk '{printf "CRYST1%9.3f%9.3f%9.3f%7.2f%7.2f%7.2f\n",\ $1, $2, $3, $4, $5, $6}' >> $outfile cat ${tempfile}peaks.interesting |\ nawk '{++i; printf "ATOM %4d OW WAT X%4d %8.3f%8.3f%8.3f%6.2f%6.2f\n",\ i, i, $4, $5, $6, $7, $8}' |\ cat >> $outfile ################################################################ # calculate distance to nearest "old" site # by default, use the "boring" list as "old" sites cat ${tempfile}boring_sites >! ${tempfile}old_sites if(! -e "$sitefile") then # no sites input, so just get inter-peak distances cat ${tempfile}peaks.interesting >! ${tempfile}old_sites endif if($?scriptfile) then # input file was an mlphare script, so only interested in real atoms cat "$scriptfile" |\ nawk '$1~/^ATOM/{print $3, $4, $5}' |\ cat >! ${tempfile}old_sites endif # count number of "old" sites set old_sites = `cat ${tempfile}old_sites | wc -l` # calculate the largest expected inter-atom distance (cell center to origin) echo "0.5 0.5 0.5" |\ nawk 'NF>2{++n; printf "%5d%10.5f%10.5f%10.5f%10.5f%5.2f%5d%10d%2s%3s%3s %1s\n", \ n, $1, $2, $3, 80, 1, "38", n, "H", "", "IUM", " "}' |\ cat >! ${tempfile}.frac coordconv XYZIN ${tempfile}.frac \ XYZOUT ${tempfile}.pdb << EOF-conv >& /dev/null CELL $CELL INPUT FRAC OUTPUT PDB ORTH 1 END EOF-conv cat ${tempfile}.pdb |\ nawk '/^ATOM/{print substr($0, 31, 8), substr($0, 39, 8), substr($0, 47, 8)}' |\ cat >! ${tempfile}center set max_dist = `nawk '{print sqrt($1*$1 + $2*$2 + $3*$3)+3}' ${tempfile}center` rm -f ${tempfile}.frac >& /dev/null rm -f ${tempfile}.pdb >& /dev/null rm -f ${tempfile}center >& /dev/null # convert "old" sites to a pdb cat ${tempfile}old_sites |\ nawk 'NF>2{++n; printf "%5d%10.5f%10.5f%10.5f%10.5f%5.2f%5d%10d%2s%3s%3s %1s\n", \ n, $1, $2, $3, 80, 1, "38", n, "C", "", "IUM", "A"}' |\ cat >! ${tempfile}old.frac coordconv XYZIN ${tempfile}old.frac \ XYZOUT ${tempfile}old.pdb << EOF-conv >& /dev/null CELL $CELL INPUT FRAC OUTPUT PDB ORTH 1 END EOF-conv rm -f ${tempfile}old.frac >& /dev/null # strip off unneeded cards nawk '/^ATOM/ || /^CRYS/ || /^SCALE/' ${tempfile}old.pdb |\ cat >! ${tempfile}both.pdb rm -f ${tempfile}old.pdb >& /dev/null # append peak list to the combined PDB file nawk '/^ATOM/{print}' $outfile >> ${tempfile}both.pdb @ start_peaks = ( $old_sites + 1 ) # renumber the atoms so that distang won't get confused cat ${tempfile}both.pdb |\ nawk '/^ATOM/{++n; $0 = sprintf("ATOM %5d%s",n,substr($0,12))} {print $0}' |\ cat >! ${tempfile}.pdb mv ${tempfile}.pdb ${tempfile}both.pdb >& /dev/null # use distang to calculate all inter-atom distances (and then sort them) distang xyzin ${tempfile}both.pdb << EOF |\ nawk '$1=="Z"{print "dist", $6, $2, $9}' | sort -n --key=4 |\ nawk '! seen[$2]{seen[$2]=1;print}' | sort -n --key=2 >! ${tempfile}dists SYMM $SGnum DIST ALL RADII C 1 RADII OW $max_dist DMIN $CLOSE_peaks FROM ATOM 1 to $old_sites TO ATOM $start_peaks to 99999 END EOF rm -f ${tempfile}both.pdb >& /dev/null #${tempfile}dists now contains the minimum # format: peak# old# min_dist # gaurentee a label file for the "old" sites cat ${tempfile}old_sites |\ nawk 'NF>2{++n; printf "label %5d atom %d\n", n, n}' |\ cat >! ${tempfile}labels # get a descriptive label from the "old site" source file if(-e "$pdbfile") then # input file was a PDB file cat $sitefile |\ nawk '/^ATOM/ || /^HETATM/{++n; printf "label %5d %s\n", n, substr($0,12,15)}' |\ cat >! ${tempfile}labels endif if($?scriptfile) then # input file was an mlphare script cat $sitefile |\ nawk '$1~/^DERIV/{deriv=$0}\ $1~/^ATOM/{++n; printf "label %5d %6s in %s\n", n, $1, deriv}' |\ cat >! ${tempfile}labels endif if(! -e "$sitefile") then # peaks list was used for distnace self-caclulation cat ${tempfile}peaks.interesting |\ nawk 'NF>2{++n;printf "label %5d peak %d\n", n, n}' |\ cat >! ${tempfile}labels endif # add these descriptive labels to the peaks list cat ${tempfile}dists ${tempfile}labels ${tempfile}peaks.interesting |\ nawk '/^dist/{dist[$2]=$4; neighbor[$2]=$3; next}\ /^label/{label[$2]=substr($0,13); next}\ {++n; print $0, dist[n], "label:", label[neighbor[n]]}' |\ cat >! ${tempfile}peaks.distlabel # # format: xf yf zf X Y Z 1/mult height dist neighbor name ... ################################################################ # print surviving peaks out to screen echo "" echo "unique peaks:" set xyz = "x y z" if($?PATT) set xyz = "u v w" echo " $xyz mult height/sigma dist from nearest neighbor" cat ${tempfile}peaks.distlabel |\ nawk '{printf "%8.5f %8.5f %8.5f %4d %8.2f %10.1fA %s\n", $1, $2, $3, 1/$7, $8,\ $9, substr($0, index($0,"label:")+7)}' echo "written to $outfile" cleanup: # clean up rm -f ${tempfile}labels >& /dev/null rm -f ${tempfile}dists >& /dev/null rm -f ${tempfile}peaks.distlabel >& /dev/null rm -f ${tempfile}peaks.interesting >& /dev/null rm -f ${tempfile}boring_sites >& /dev/null rm -f ${tempfile}old_sites >& /dev/null if($?BAD) exit 9 exit ################################################################ ################################################################ ################################################################ Setup: set siteCELL set sitefile # scan command line foreach arg ( $* ) # recognize map files if(("$arg" =~ *.map)||("$arg" =~ *.ext)) then if(! -e "$arg") then echo "WARNING: $arg does not exist! " continue endif set mapfile = "$arg" continue endif # recognize pdb files if(("$arg" =~ *.pdb)||("$arg" =~ *.brk)) then if(! -e "$arg") then echo "WARNING: $arg does not exist! " continue endif set pdbfile = "$arg" set siteCELL = `nawk '/^CRYST/{print $2, $3, $4, $5, $6, $7}' $pdbfile | tail -1` continue endif # recognize mlphare scripts? if(-e "$arg") then cat "$arg" |\ nawk '$1~/^ATOM/ || $1~/^BADATOM/ || $1~/^OLDATOM/{print $3, $4, $5}' |\ cat >! ${tempfile}boring_sites # format: xf yf zf set sitefile = "$arg" set scriptfile = "$arg" endif # recognize sigma-cutoff set temp = `echo "$arg" | awk '$1+0 != 0 && ! /A$/{print $1+0}'` if("$temp" != "") then set sigma = "$temp" continue endif # recognize closeness-cutoff set temp = `echo "$arg" | awk '$1+0 != 0 && /A$/{print $1+0}'` if("$temp" != "") then set CLOSE_peaks = "$temp" continue endif # recognize "top peak only" flag if("$arg" =~ "-top"*) then set top_only continue endif end # get map parameters echo "go" | mapdump mapin $mapfile >! ${tempfile}.mapdump if($status) goto Help set CELL = `nawk '/Cell dimensions/{print $4, $5, $6, $7, $8, $9; exit}' ${tempfile}.mapdump` set SGnum = `nawk '/Space-group/{print $3; exit}' ${tempfile}.mapdump` set SG = `nawk -v SGnum=$SGnum '$1==SGnum {print $4}' $CLIBD/symop.lib | head -1` set GRID = `nawk '/Grid sampling on x, y, z/{print $(NF-2), $(NF-1), $NF; exit}' ${tempfile}.mapdump` rm -f ${tempfile}.mapdump >& /dev/null # see if this is a Patterson map set temp = `nawk -v SGnum="$SGnum" '$1==SGnum{print $4}' $CLIBD/symop.lib | nawk '/[abcdmn-]/{print "PATT"}'` if("$temp" != "") set PATT # convert input coordinate file formats to fractional if($#siteCELL != 6) set siteCELL = ( $CELL ) if(-e "$pdbfile") then # convert orthogonal PDB coordinates to fractional coordconv xyzin $pdbfile xyzout ${tempfile}.xyz << EOF >> $logfile CELL $siteCELL INPUT PDB OUTPUT FRAC END EOF # all we need are fractional coordinates cat ${tempfile}.xyz |\ nawk '{print $2, $3, $4}' |\ cat >! ${tempfile}boring_sites rm -f ${tempfile}.xyz >& /dev/null set sitefile = "$pdbfile" endif if(! -e "${tempfile}boring_sites") touch ${tempfile}boring_sites set boring_sites = `cat ${tempfile}boring_sites | wc -l` # format: xf yf zf # decide on a "closeness" cutoff for two peaks being the same if(! $?CLOSE_peaks) set CLOSE_peaks if("$CLOSE_peaks" == "") then # set the "close" criteria to be one grid unit echo "$GRID $CELL" |\ nawk '$1+0>0{print $4/$1}\ $2+0>0{print $5/$2}\ $3+0>0{print $6/$3}' |\ sort -n >! ${tempfile}close set CLOSE_peaks = `nawk 'NR==1{printf "%.2f", $1}' ${tempfile}close` rm -f ${tempfile}close >& /dev/null endif # guess? if("$CLOSE_peaks" == "") set CLOSE_peaks = 0.5 goto Return_from_Setup exit ################################# # the future? - support other coordinate file formats