#! /bin/csh -f # # origins.com - James Holton 8-2-11 # # script for translating one PDB to a number of alternative origins # and checking if the symmetry-expanded atoms are close to those of # another PDB. # # Atoms with the same name and resiude number are compared # set nawk = nawk $nawk 'BEGIN{print}' >& /dev/null if($status) set nawk = awk alias nawk $nawk set SG = "" set right_pdb = "" set wrong_pdb = "" set outfile = neworigin.pdb set tempfile = ${CCP4_SCR}/origins_temp$$ set tempfile = ./origins_temp ################################################################################ goto Setup Help: cat << EOF usage: $0 right_origin.pdb wrong_origin.pdb $SG [correlate] [nochains] where: right_origin.pdb is relative to your "desired" origin wrong_origin.pdb is relative to another origin $SG is your space group wrong_origin.pdb will be moved to every possible origin in ${SG} and then checked to see if the moved, and symmetry-expanded atoms line up with the ones in right_origin.pdb. The results of the origin choice that give the best agreement to the reference pdb will be copied to $outfile Using the word "correlate" on the command line signals the program to ignore atom names in the comparision and instead use the correlation coefficient of calculated electron density maps as the "similarity score" By default, the program breaks up the input PDB files into any "chains" specified therein. You can turn this off by using the word "nochains" on the command line. EOF exit 9 Return_from_Setup: ################################################################################ set SCORE = "rmsd" if($?CORRELATE) set SCORE = " CC " # get unit cell from $right_pdb set CELL = `nawk '/^CRYST/{print $2,$3,$4,$5,$6,$7}' $right_pdb $wrong_pdb | head -1` if("$CELL" == "") then echo "ERROR: no CRYST card in $right_pdb" goto Help endif # try to get space group? if("$SG" == "") then set pdbSG = `nawk '/^CRYST/{print substr($0,56,12)}' $right_pdb $wrong_pdb | head -1` set SG = `nawk -v pdbSG="$pdbSG" -F "[\047]" 'pdbSG==$2{print;exit}' ${CLIBD}/symop.lib | nawk '{print $4}'` endif if("$SG" == "") then # hmm, throw an error or go on? ... set SG = P1 endif # check for H3/R3 weirdness set test = `echo $CELL | awk '{print ( $4+0==90 && $5+0==90 && $6+0==120 )}'` if("$SG" =~ R* && "$test" == "1" ) then # probably should use hexagonal system set SG = `echo $SG | awk '{gsub("R","H");print}'` endif if("$SG" =~ H* && "$test" == "0" ) then # probably should use rhombohedral system set SG = `echo $SG | awk '{gsub("H","R");print}'` endif set CCSG = $SG # preemtive cleanup rm -f ${tempfile}_right[0-9][0-9][0-9].pdb >& /dev/null rm -f ${tempfile}_wrong[0-9][0-9][0-9].pdb >& /dev/null if (! $?BYFILE) then # break up "right" file into its respective chains cat $right_pdb |\ nawk '/^ATOM/{resnum=substr($0, 23, 4)+0;segid = substr($0, 22, 1); \ if(last_segid=="")last_segid=segid;\ if(resnum tempfile chain ".pdb";\ chain=sprintf("%03d", chain+1)}\ /^ATOM/||/^HETATM/{print "ATOM "substr($0,7,66) > tempfile chain ".pdb"}\ END{print "REMARK chain", segid > tempfile chain ".pdb"}' # chains are now separated into files # ${tempfile}_right###.pdb # break up "wrong" file into its respective chains cat $wrong_pdb |\ nawk '/^ATOM/{resnum=substr($0, 23, 4)+0;segid = substr($0, 22, 1); \ if(last_segid=="")last_segid=segid;\ if(resnum tempfile chain ".pdb";\ chain=sprintf("%03d", chain+1)}\ /^ATOM/||/^HETATM/{print "ATOM "substr($0,7,66) > tempfile chain ".pdb"}\ END{print "REMARK chain", segid > tempfile chain ".pdb"}' # chains are now separated into files # ${tempfile}_wrong###.pdb else echo "REMARK CHAIN _" >! ${tempfile}_right001.pdb cat $right_pdb |\ nawk '/^ATOM/||/^HETATM/{print "ATOM "substr($0,7,66)}' |\ cat >> ${tempfile}_right001.pdb echo "REMARK CHAIN _" >> ${tempfile}_right001.pdb echo "REMARK CHAIN _" >! ${tempfile}_wrong001.pdb cat $wrong_pdb |\ nawk '/^ATOM/||/^HETATM/{print "ATOM "substr($0,7,66)}' |\ cat >> ${tempfile}_wrong001.pdb echo "REMARK CHAIN _" >> ${tempfile}_wrong001.pdb endif # add the proper headers foreach file ( ${tempfile}_right[0-9][0-9][0-9].pdb ${tempfile}_wrong[0-9][0-9][0-9].pdb ) echo "END" >> $file # calculate the center of mass pdbset xyzin $file xyzout ${tempfile}.pdb << EOF >! ${tempfile}.log CELL $CELL CHAIN " " COM EOF egrep "^REMARK" $file >! ${tempfile}new.pdb egrep -v "REMARK" ${tempfile}.pdb >> ${tempfile}new.pdb mv ${tempfile}new.pdb ${tempfile}.pdb # make a pdb file of an atom at the COM of this file nawk '/^CRYST/ || /^SCALE/' ${tempfile}.pdb >! ${tempfile}COM.pdb cat ${tempfile}.log |\ nawk '$1=="Center"{printf "ATOM 1 CA GLY 1 %8.3f%8.3f%8.3f 1.00 80.00\n", $4, $5, $6;exit}' |\ cat >> ${tempfile}COM.pdb echo "EOF" >> ${tempfile}COM.pdb # get fractional version of the COM too coordconv XYZIN ${tempfile}COM.pdb XYZOUT ${tempfile}.xyz << EOF >& /dev/null CELL $CELL INPUT PDB OUTPUT FRAC EOF # put this center as a remark in the PDB file cat ${tempfile}.log |\ nawk '$1=="Center"{print "COM", $4, $5, $6}' |\ cat - ${tempfile}.xyz |\ nawk '{printf "REMARK " $0; getline; print " ", $2, $3, $4}' |\ cat >! $file cat ${tempfile}.pdb |\ nawk '/^REMARK/ && $NF=="chain"{print "REMARK chain _";next}\ /^REMARK/{print}' >> $file cat ${tempfile}.pdb |\ nawk '! /REMARK/{print substr($0,1,66)}' >> $file rm -f ${tempfile}.pdb ${tempfile}COM.pdb ${tempfile}.xyz ${tempfile}.log >& /dev/null # now "$file" has its center of mass in its header end # display stats echo -n "reference chains: " foreach file ( ${tempfile}_right[0-9][0-9][0-9].pdb ) nawk '/^REMARK chain/{printf "%s ", $3;exit}' $file end echo " ( "`basename $right_pdb`" )" echo -n " subject chains: " foreach file ( ${tempfile}_wrong[0-9][0-9][0-9].pdb ) nawk '/^REMARK chain/{printf "%s ", $3;exit}' $file end echo " ( "`basename $wrong_pdb`" )" echo "CELL $CELL" echo "SG $SG" echo "" ################################ # get tranformations from a space group # get symmetry operations from space group cat ${CLIBD}/symop.lib |\ nawk -v SG=$SG '/[XYZ]/ && ! /[PCIFRH]/ && p==1 {print $1 $2 $3 $4}\ $5 ~ /^PG/{p=0} $4 == SG{p=1}' |\ cat >! ${tempfile}symops set symops = `cat ${tempfile}symops` # get origin-shift operators from a space group (at the bottom of this script) set table = $0 cat $table |\ nawk '/TABLE OF ALLOWED ORIGIN SHIFTS/,/END OF THE TABLE/' |\ nawk -v SG=$SG '/^[PCIFRH]/ && $0 ~ SG" "{getline; while(NF>0){\ print $1 "," $2 "," $3;getline};exit}' |\ cat >! ${tempfile}origins set origins = `cat ${tempfile}origins` #set origins = `awk 'BEGIN{for(y=0;y<1;y+=0.01){print "0,"y",0";print "1/2,"y",0";print "0,"y",1/2";print "1/2,"y",1/2"}}'` if($?NO_ORIGINS || "$origins" == "") then set origins = "0,0,0" set best_origin = "0,0,0" set CCSG = 1 endif rm -f ${tempfile}origins rm -f ${tempfile}symops # use deconvolution to find optimal shift if(1) then echo "deconvoluting maps..." # do a coarser map grid sampling than normal (~ 3A resolution) set reso = 2 # make a "mask" of all possible origin shifts echo -n "" >! ${tempfile}xyz.txt if("$SG" == "P1") then echo "0 0 0" >> ${tempfile}xyz.txt else foreach origin ( $origins ) # break up the origin string set Xf = `echo "$origin" | nawk -F "," '{print $1}'` set Yf = `echo "$origin" | nawk -F "," '{print $2}'` set Zf = `echo "$origin" | nawk -F "," '{print $3}'` set X = `echo "$Xf" | nawk -F "/" 'NF==2{$1/=$2} {printf "%.10f",$1}'` set Y = `echo "$Yf" | nawk -F "/" 'NF==2{$1/=$2} {printf "%.10f",$1}'` set Z = `echo "$Zf" | nawk -F "/" 'NF==2{$1/=$2} {printf "%.10f",$1}'` echo "$X $Y $Z $Xf $Yf $Zf" |\ awk '{print $1+0,$2+0,$3+0}\ $4=="x"{for(x=0;x<1;x+=0.001)print x,$2+0,$3+0}\ $5=="y"{for(y=0;y<1;y+=0.001)print $1+0,y,$3+0}\ $6=="z"{for(z=0;z<1;z+=0.001)print $1+0,$2+0,z}\ $4=="x=y=z"{for(x=0;x<2;x+=0.001)print x,x,x}' |\ cat >> ${tempfile}xyz.txt end endif cat ${tempfile}xyz.txt |\ awk 'NF>2{++n; printf "%5d%10.5f%10.5f%10.5f%10.5f%5.2f%5d%10d%2s%3s%3s %1s\n", \ n, $1, $2, $3, 80, 1, "38", n, "H", "", "IUM", " "}' |\ cat >! ${tempfile}.frac coordconv XYZIN ${tempfile}.frac \ XYZOUT ${tempfile}originmask.pdb << EOF-conv >& /dev/null CELL $CELL INPUT FRAC OUTPUT PDB ORTH 1 END EOF-conv sfall xyzin ${tempfile}originmask.pdb hklout ${tempfile}originmask.mtz << EOF-sfall > /dev/null MODE sfcalc xyzin CELL $CELL SYMM 1 RESO $reso EOF-sfall fft hklin ${tempfile}originmask.mtz mapout ${tempfile}originmask.map << EOF >! ${tempfile}.log labin F1=FC PHI=PHIC scale F1 1 80 reso $reso symm P1 EOF set scale = `awk '/Maximum density/ && $NF>0{print 1/$NF}' ${tempfile}.log` rm -f ${tempfile}.log if("$scale" == "") set scale = 1 echo "scale factor $scale 0" |\ mapmask mapin ${tempfile}originmask.map mapout ${tempfile}new.map > /dev/null mv ${tempfile}new.map ${tempfile}originmask.map echo "scale factor 0 1" |\ mapmask mapin ${tempfile}originmask.map mapout ${tempfile}one.map > /dev/null # echo "mask cut 0" |\ # mapmask mapin ${tempfile}originmask.map mskout ${tempfile}originmask.msk > /dev/null # echo "maps mult" |\ # mapmask mapin ${tempfile}one.map mskin ${tempfile}originmask.msk \ # mapout ${tempfile}originmask.map > /dev/null if("$SG" == "P1") then # P1 origin is good everywhere, so no mask cp ${tempfile}one.map ${tempfile}originmask.map > /dev/null endif rm -f ${tempfile}one.map ${tempfile}originmask.mtz rm -f ${tempfile}originmask.pdb ${tempfile}.frac ${tempfile}xyz.txt # make an all-carbon version of this model echo "CELL $CELL" | pdbset xyzin $right_pdb xyzout ${tempfile}.pdb >! ${tempfile}.log if($status) exit 9 egrep "^CRYST1|^ATOM" ${tempfile}.pdb |\ awk '/^ATOM/{$0="ATOM 1 CA ALA 1 "substr($0,31,25)" 1.00 80.00"} {print}' |\ cat >! ${tempfile}mapme.pdb sfall xyzin ${tempfile}mapme.pdb hklout ${tempfile}right.mtz << EOF-sfall >! ${tempfile}.log MODE sfcalc xyzin CELL $CELL SYMM $SG RESO $reso EOF-sfall if($status) exit 9 echo "CELL $CELL" | pdbset xyzin $wrong_pdb xyzout ${tempfile}.pdb > /dev/null egrep "^CRYST1|^ATOM" ${tempfile}.pdb |\ awk '/^ATOM/{$0="ATOM 1 CA ALA 1 "substr($0,31,25)" 1.00 80.00"} {print}' |\ cat >! ${tempfile}mapme.pdb sfall xyzin ${tempfile}mapme.pdb hklout ${tempfile}wrong.mtz << EOF-sfall >! ${tempfile}.log MODE sfcalc xyzin CELL $CELL SYMM $SG RESO $reso EOF-sfall if($status) exit 9 rm -f ${tempfile}del.mtz sftools << EOF > /dev/null read ${tempfile}right.mtz read ${tempfile}wrong.mtz set labels Fright PHIright Fwrong PHIwrong calc ( COL Fq PHIdel ) = ( COL Fright PHIright ) ( COL Fwrong PHIwrong ) / calc COL W = COL Fq select COL Fq > 1 calc COL W = 1 COL Fq / select all calc F COL Fdel = COL W 0.5 ** write ${tempfile}del.mtz col Fdel PHIdel y stop EOF fft hklin ${tempfile}del.mtz mapout ${tempfile}del.map << EOF >! ${tempfile}.log labin F1=Fdel PHI=PHIdel reso $reso symm P1 EOF # make sure that we define "sigma" for the unmasked map echo "scale sigma 1 0" |\ mapmask mapin ${tempfile}del.map mapout ${tempfile}zscore.map > /dev/null mapmask mapin1 ${tempfile}zscore.map mapin2 ${tempfile}originmask.map \ mapout ${tempfile}pickme.map << EOF >! ${tempfile}.log mode mapin1 mapin2 maps mult EOF peakmax mapin ${tempfile}pickme.map xyzfrc ${tempfile}peak.txt << EOF >! ${tempfile}.log output frac # threshold 3 numpeaks 10 EOF set scale = `awk '/peaks higher than the threshold/{print $(NF-1)/$9}' ${tempfile}.log` echo "fractional origin shift Z-score" cat ${tempfile}peak.txt |\ awk -v scale=$scale '/ATOM/{printf "%7.4f %7.4f %7.4f %s\n", $3,$4,$5,$6/scale}' |\ awk 'NR==1{max=$4} $4>max/3 || $4>3' |\ tee ${tempfile}likely_origins.txt # clean up a bit rm -f ${tempfile}right.mtz ${tempfile}wrong.mtz rm -f ${tempfile}del.mtz ${tempfile}del.map rm -f ${tempfile}originmask.map ${tempfile}pickme.map ${tempfile}zscore.map rm -f ${tempfile}peak.txt ${tempfile}.log # now integrate these shifts with the "official" origin list echo "ORIGINS $origins" |\ cat - ${tempfile}likely_origins.txt |\ awk '/^ORIGINS/{\ for(i=2;i<=NF;++i){\ ++n;\ split($i,xyz,",");\ for(k in xyz){\ if(xyz[k]~/\//){\ split(xyz[k],w,"/");\ xyz[k]=w[1]/w[2];\ }\ }\ x0[n]=xyz[1];y0[n]=xyz[2];z0[n]=xyz[3];\ }\ }\ NF==4{x=$1;y=$2;z=$3;score=$4;\ minfd=999;\ for(i in x0){\ xp=x0[i];yp=y0[i];zp=z0[i];\ if(xp=="x")xp=x;\ if(yp=="y")yp=y;\ if(zp=="z")zp=z;\ if(xp=="x=y=z")xp=yp=zp=x;\ origin[i]= xp","yp","zp;\ fdx=sqrt((x-xp)^2);if(fdx>0.9)fdx-=1;\ fdy=sqrt((y-yp)^2);if(fdy>0.9)fdy-=1;\ fdz=sqrt((z-zp)^2);if(fdz>0.9)fdz-=1;\ fd=sqrt(fdx^2+fdy^2+fdz^2);\ if(fd! ${tempfile}neworigins.txt set origins = `awk '! seen[$3]{print $3} {++seen[$3]}' ${tempfile}neworigins.txt` rm -f ${tempfile}likely_origins.txt ${tempfile}neworigins.txt endif ######################################## # now run through all origins, and chain pairings echo "chain origin " echo "r s dX dY dZ symop $SCORE" again: echo -n "" >! ${tempfile}scores foreach right_chain ( ${tempfile}_right[0-9][0-9][0-9].pdb ) # get fractional coordinate limits that cover the "right" chain coordconv xyzin $right_chain xyzout ${tempfile}.xyz << EOF >> /dev/null INPUT PDB OUTPUT FRAC END EOF cat ${tempfile}.xyz |\ nawk -v del=0.5 'BEGIN{xmin=ymin=zmin=99999999} \ $2xmax{xmax=$2}\ $3ymax{ymax=$3}\ $4zmax{zmax=$4}\ END{print xmin-del, xmax+del, ymin-del, ymax+del, zmin-del, zmax+del}' |\ cat >! ${tempfile}.xyzlim set xyzlim = `cat ${tempfile}.xyzlim` rm -f ${tempfile}.xyzlim rm -f ${tempfile}.xyz if($?CORRELATE) then # we want to do comparison with maps, not atoms # do a coarser map grid sampling than normal (~ 3A resolution) set reso = 3 set fakeCELL = `echo $CELL $reso | nawk '{print $1/$NF/2,$2/$NF/2,$3/$NF/2,$4,$5,$6}'` set BADD = `echo $reso | nawk '{printf "%d", 79*($1/3)^2}'` echo "CELL $fakeCELL" | pdbset xyzin $right_chain xyzout ${tempfile}.pdb > /dev/null # make an all-carbon version of this model cat ${tempfile}.pdb |\ awk '/^ATOM/{$0="ATOM 1 CA ALA 1 "substr($0,31,25)" 1.00 80.00"} {print}' |\ cat >! ${tempfile}mapme.pdb sfall xyzin ${tempfile}mapme.pdb mapout ${tempfile}right.map << EOF-sfall > /dev/null MODE ATMMAP CELL $fakeCELL SYMM $CCSG EOF-sfall # recover coarse grid spacing set GRID = `echo "GO" | mapdump MAPIN ${tempfile}right.map | nawk '/Grid sampling/{print $(NF-2), $(NF-1), $NF; exit}'` rm -f ${tempfile}.pdb >& /dev/null # re-calculate the "right" map with reduced grid cat $right_chain |\ awk '/^ATOM/{++n;$0="ATOM 1 CA ALA 1 "substr($0,31,25)" 1.00 80.00"} {print}' |\ awk '/^ATOM/{++n;}\ n==1{++n;\ print "ATOM 0 CA SHT 0 0.000 0.000 0.000 0.01 99.00";\ print "ATOM 0 CA SHT 0 0.000 0.000 0.000 -0.01 99.00";}\ {print}' |\ cat >! ${tempfile}mapme.pdb sfall xyzin ${tempfile}mapme.pdb mapout ${tempfile}right.map << EOF-sfall > /dev/null MODE ATMMAP CELL $CELL SYMM $CCSG GRID $GRID BADD $BADD EOF-sfall rm -f ${tempfile}mapme.pdb >& /dev/null # determine grid spacing to use for "wrong" map set GRID = `echo "GO" | mapdump MAPIN ${tempfile}right.map | nawk '/Grid sampling/{print $(NF-2), $(NF-1), $NF; exit}'` # set xyzlim = "0 1 0 1 0 1" endif # now loop over the wrong chains foreach wrong_chain ( ${tempfile}_wrong[0-9][0-9][0-9].pdb ) # retrieve info from file headers set right_COM = `nawk '/^REMARK COM/{print $3, $4, $5;exit}' $right_chain` set wrong_COM = `nawk '/^REMARK COM/{print $3, $4, $5;exit}' $wrong_chain` set shift_COM = `echo "$wrong_COM $right_COM" | nawk '{print $4-$1,$5-$2,$6-$3}'` set right_COM_frac = `nawk '/^REMARK COM/{print $6, $7, $8;exit}' $right_chain` set wrong_COM_frac = `nawk '/^REMARK COM/{print $6, $7, $8;exit}' $wrong_chain` set shift_COM_frac = `echo "$wrong_COM_frac $right_COM_frac" | nawk '{print $4-$1,$5-$2,$6-$3}'` set right_chain_ID = `nawk '/^REMARK chain/{print $3}' $right_chain` set wrong_chain_ID = `nawk '/^REMARK chain/{print $3}' $wrong_chain` if("$right_chain_ID" == "") set right_chain_ID = "_" if("$wrong_chain_ID" == "") set wrong_chain_ID = "_" # try every possible origin choice foreach origin ( $origins ) # break up the origin string set Xf = `echo "$origin" | nawk -F "," '{print $1}'` set Yf = `echo "$origin" | nawk -F "," '{print $2}'` set Zf = `echo "$origin" | nawk -F "," '{print $3}'` set X = `echo "$Xf" | nawk -F "/" 'NF==2{$1/=$2} {printf "%.10f",$1}'` set Y = `echo "$Yf" | nawk -F "/" 'NF==2{$1/=$2} {printf "%.10f",$1}'` set Z = `echo "$Zf" | nawk -F "/" 'NF==2{$1/=$2} {printf "%.10f",$1}'` # add translation along any polar axes #set X = `echo "$opt_frac_shift $Xf $X" | nawk '/x/{$NF=$1} {print $NF}'` #set Y = `echo "$opt_frac_shift $Yf $Y" | nawk '/y/{$NF=$2} {print $NF}'` #set Z = `echo "$opt_frac_shift $Zf $Z" | nawk '/z/{$NF=$3} {print $NF}'` # calculate how to "level" the centers of the atom constellations # NOTE: this is not a good idea if the chains are not rigid-body related #set polar_slip = `echo "$opt_frac_shift $Xf $Yf $Zf" | nawk '! /x/{$1=0} ! /y/{$2=0} ! /z/{$3=0} {print $1,$2,$3}'` # now see which (if any) symops/cell translations will bring the # shifted COM of "wrong_chain" anywhere near the COM of the "right" chain # apply current origin shift to "wrong" COM set new_COM = `echo "$wrong_COM_frac $X $Y $Z" | nawk '{print $1+$4, $2+$5, $3+$6}'` gensym << EOF >! ${tempfile}.log CELL $CELL SYMM $SG atom X $new_COM XYZLIM $xyzlim EOF # only use symops that showed up in the gensym result cat ${tempfile}.log |\ nawk '/List of sites/,/atoms generated from/' |\ nawk 'NF>10{print $NF, $2, $3, $4, $5, $6, $7}' |\ sort -u -k1,2 >! ${tempfile}symop_center # format: symop xf yf zf X Y Z if($?CORRELATE && ! $?best_origin) then echo "1 $right_COM_frac $new_COM" >! ${tempfile}symop_center endif # loop over the symmetry operations foreach line ( `nawk '{print NR}' ${tempfile}symop_center` ) # skip all other origins if a good one has been found if(("$good_origin" != "")&&("$good_origin" != "$X $Y $Z") && "$CCSG" != "1") then # echo "skipping: $good_origin $X $Y $Z" continue endif # skip rest of "right_chain" if we have already found a match if("$good_right" == "$right_chain") then # echo "skipping: $good_right == $right_chain" continue endif if("$good_wrong" == "$wrong_chain") then # echo "skipping: $good_wrong == $wrong_chain" continue endif # retrieve symmetry operation set symop = `nawk -v line=$line 'NR==line{print $1}' ${tempfile}symop_center` set symop = $symops[$symop] # progress meter echo "$right_chain_ID $wrong_chain_ID $Xf $Yf $Zf $symop" |\ nawk '{printf "%s vs %s @ %3s %3s %3s %-20s", $1, $2, $3,$4,$5, $6}' # move the PDB to the new position pdbset xyzin $wrong_chain \ xyzout ${tempfile}symmed.pdb << EOF >! ${tempfile}.log CELL $CELL SYMGEN $symop SHIFT FRAC $X $Y $Z COM EOF set new_COM = `nawk '$1=="Center"{print $4, $5, $6}' ${tempfile}.log` # get fractional version too nawk '/^CRYST/ || /^SCALE/' $wrong_chain >! ${tempfile}COM.pdb cat ${tempfile}.log |\ nawk '$1=="Center"{printf "ATOM 1 CA GLY 1 %8.3f%8.3f%8.3f 1.00 80.00\n", $4, $5, $6;exit}' |\ cat >> ${tempfile}COM.pdb echo "END" >> ${tempfile}COM.pdb coordconv XYZIN ${tempfile}COM.pdb XYZOUT ${tempfile}.xyz << EOF > /dev/null CELL $CELL INPUT PDB OUTPUT FRAC EOF set new_COM_frac = `nawk '{print " ", $2, $3, $4}' ${tempfile}.xyz` # now find the nearest cell translation to bring these COMs together set cell_shift = `echo "$right_COM_frac $new_COM_frac" | nawk '{printf "%.0f %.0f %.0f", $1-$4+0, $2-$5+0, $3-$6+0}'` # move the "symmed" "wrong" chain into the right cell pdbset xyzin ${tempfile}symmed.pdb \ xyzout ${tempfile}moved.pdb << EOF >! ${tempfile}.log CELL $CELL SHIFT FRAC $cell_shift EOF # calclulate RMS distance between chains set score = `nawk -f ${tempfile}rmsd.awk $right_chain ${tempfile}moved.pdb | nawk '/RMSD\(all/{print -$2}'` if($?CORRELATE) then # make sure all atoms are residues are readabel cat ${tempfile}moved.pdb |\ awk '/^ATOM/{$0="ATOM 1 CA ALA 1 "substr($0,31,25)" 1.00 80.00"} {print}' |\ awk '/^ATOM/{++n;}\ n==1{++n;\ print "ATOM 0 CA SHT 0 0.000 0.000 0.000 0.01 99.00";\ print "ATOM 0 CA SHT 0 0.000 0.000 0.000 -0.01 99.00";}\ {print}' |\ cat >! ${tempfile}mapme.pdb # create a map of these atoms sfall xyzin ${tempfile}mapme.pdb mapout ${tempfile}moved.map << EOF-sfall > /dev/null MODE ATMMAP CELL $CELL SYMM $CCSG GRID $GRID BADD $BADD EOF-sfall rm -f ${tempfile}mapme.pdb >& /dev/null # compare to the "right" map echo "correlate section" |\ overlapmap mapin1 ${tempfile}right.map \ mapin2 ${tempfile}moved.map mapout /dev/null |\ nawk '/Total correlation/{print $NF}' >! ${tempfile}correlation set score = `nawk '{print $1}' ${tempfile}correlation` # display CC echo "$score" | nawk '$1!~/[^0-9]/{print " -";exit} {printf "%5.2f\n", $1}' else # display rmsd echo "$score" | nawk '$1~/[0-9]/{printf "%5.1f\n", -$1; exit} {print "no identities"}' endif rm -f ${tempfile}symmed.pdb ${tempfile}moved.pdb if("$score" == "") continue # see if fit is "good enough" set goodenough = `echo $score | nawk '{print ($1<1)}'` if($?CORRELATE) set goodenough = `echo $score | nawk '{print ($1>0.8)}'` if(($goodenough == 1)&&($?SPEEDUP)) then echo "thats good enough..." set good_origin = "$X $Y $Z" set good_right = "$right_chain" set good_wrong = "$wrong_chain" endif # make a running log echo "$right_chain $wrong_chain $Xf $Yf $Zf $X $Y $Z $cell_shift $symop $score " >> ${tempfile}scores end end end end echo "" # select out the "best" matches (on the same origin) sort -nr -k13 ${tempfile}scores |\ nawk '! seen[$2, $3, $4, $5]{++seen[$2, $3, $4, $5];\ score[$3 " " $4 " " $5]+=$NF;++count[$3 " " $4 " " $5]}\ END{for(origin in score) print score[origin], count[origin], origin}' |\ sort -nr >! ${tempfile}best_scores set best_origin = `head -1 ${tempfile}best_scores | nawk '{print $3, $4, $5}'` if($?CORRELATE && "$CCSG" != "1" && ! $?NO_ORIGINS) then set origins = `echo $best_origin | awk '{print $1","$2","$3}'` set CCSG = 1 set good_right = "" set good_wrong = "" goto again endif # extract the best operators consistent with this origin sort -nr -k13 ${tempfile}scores |\ nawk -v best_origin="$best_origin" '$3" "$4" "$5==best_origin{print}' |\ nawk '! seen[$2]{print; seen[$2]=1}' |\ sort >! ${tempfile}best_scores #format r_chain w_chain 1/2 1/2 1/2 0.000 0.000 0.000 1 0 1 X,Y,Z rmsd echo -n "" >! ${tempfile}out.pdb foreach line ( `nawk '{print NR}' ${tempfile}best_scores` ) # retrieve chain info set right_chain = `nawk -v line=$line 'NR==line{print $1}' ${tempfile}best_scores` set wrong_chain = `nawk -v line=$line 'NR==line{print $2}' ${tempfile}best_scores` set right_chain_ID = `nawk '/^REMARK chain/{print $3}' $right_chain` set wrong_chain_ID = `nawk '/^REMARK chain/{print $3}' $wrong_chain` if("$right_chain_ID" == "") set right_chain_ID = "_" if("$wrong_chain_ID" == "") set wrong_chain_ID = "_" # retrieve transofmation info set X = `nawk -v chain=$wrong_chain '$2==chain{printf "%.2f", $6}' ${tempfile}best_scores` set Y = `nawk -v chain=$wrong_chain '$2==chain{printf "%.2f", $7}' ${tempfile}best_scores` set Z = `nawk -v chain=$wrong_chain '$2==chain{printf "%.2f", $8}' ${tempfile}best_scores` set x = `nawk -v chain=$wrong_chain '$2==chain{print $6+$9 }' ${tempfile}best_scores` set y = `nawk -v chain=$wrong_chain '$2==chain{print $7+$10}' ${tempfile}best_scores` set z = `nawk -v chain=$wrong_chain '$2==chain{print $8+$11}' ${tempfile}best_scores` set symop = `nawk -v chain=$wrong_chain '$2==chain{print $12}' ${tempfile}best_scores` # retrieve score set score = `nawk -v chain=$wrong_chain '$2==chain{print -$13}' ${tempfile}best_scores` if($?CORRELATE) set score = `echo $score | nawk '{print -$1}'` if("$score" == "") then echo "unable to match $wrong_pdb chain $wrong_chain_ID to any chain in $right_pdb" continue endif # apply it echo "$x $y $z $symop" |\ nawk '{printf "applying %5.2f %5.2f %5.2f %-15s\n", $1, $2, $3, $4}' echo "to $wrong_pdb chain $wrong_chain_ID --> $outfile chain $right_chain_ID ($SCORE = $score)" if("$right_chain_ID" == "_") set right_chain_ID = " " pdbset xyzin $wrong_chain xyzout ${tempfile}moved.pdb << EOF >> /dev/null CELL $CELL CHAIN "$right_chain_ID" SYMGEN $symop SHIFT FRAC $x $y $z EOF # update the output file egrep "^ATOM" ${tempfile}moved.pdb >> ${tempfile}out.pdb # warn about different origins if(! $?last_origin) set last_origin if(("$last_origin" != "$X $Y $Z")&&("$last_origin" != "")) then echo "WARNING: $wrong_pdb chain $wrong_chain_ID is on a different origin! " endif set last_origin = "$X $Y $Z" end echo "END" >> ${tempfile}out.pdb echo "CELL $CELL" |\ pdbset xyzin ${tempfile}out.pdb xyzout $outfile > /dev/null if($?BYFILE) then # no point in messing around, just apply the shift to the original file echo "CELL $CELL" |\ pdbset xyzin $wrong_pdb xyzout ${tempfile}out.pdb > /dev/null pdbset xyzin ${tempfile}out.pdb xyzout $outfile << EOF > /dev/null CELL $CELL SYMGEN $symop SHIFT FRAC $x $y $z EOF endif if($?SPEEDUP) exit # report final, overall score if($?CORRELATE) then # calculate the overall map correlation set logfile = /dev/null # create a simplified file for SFALL cat $right_pdb |\ awk '/^CRYST|^END|^REM/{print}\ /^ATOM/ || /^HETATM/{++n;printf("ATOM 1 CA ALA %5d %25s 1.00 80.00\n",n,substr($0,31,25))}' |\ awk '/^ATOM/{++n;}\ n==1{++n;\ print "ATOM 0 CA SHT 0 0.000 0.000 0.000 0.01 99.00";\ print "ATOM 0 CA SHT 0 0.000 0.000 0.000 -0.01 99.00";}\ {print}' |\ cat >! ${tempfile}mapme.pdb sfall xyzin ${tempfile}mapme.pdb mapout ${tempfile}right.map << EOF-sfall > $logfile MODE ATMMAP CELL $CELL SYMM $SG EOF-sfall # also record the "right" atom codes cat $right_pdb |\ awk '/^ATOM/{++n;printf "code %04d %s\n",n,$0}' |\ cat >! ${tempfile}atomcodes.txt # recover grid spacing set GRID = `echo "GO" | mapdump MAPIN ${tempfile}right.map | nawk '/Grid sampling/{print $(NF-2), $(NF-1), $NF; exit}'` # calculate the "label" map so we can assign atom-specific correlations sfall xyzin ${tempfile}mapme.pdb mapout ${tempfile}label.map << EOF-sfall > /dev/null MODE ATMMAP RESMOD CELL $CELL SYMM $SG GRID $GRID EOF-sfall # calculate map for "moved" model cat $outfile |\ awk '/^CRYST|^END|^REM/{print}\ /^ATOM/ || /^HETATM/{++n;printf("ATOM 1 CA ALA %5d %25s 1.00 80.00\n",n,substr($0,31,25))}' |\ awk '/^ATOM/{++n;}\ n==1{++n;\ print "ATOM 0 CA SHT 0 0.000 0.000 0.000 0.01 99.00";\ print "ATOM 0 CA SHT 0 0.000 0.000 0.000 -0.01 99.00";}\ {print}' |\ cat >! ${tempfile}mapme.pdb # also record the "wrong" atom codes cat $outfile |\ awk '/^ATOM/{++n;printf "code %04d %s\n",n,$0}' |\ cat >! ${tempfile}wrong_atomcodes.txt sfall xyzin ${tempfile}mapme.pdb mapout ${tempfile}moved.map << EOF-sfall > $logfile MODE ATMMAP CELL $CELL SYMM $SG GRID $GRID EOF-sfall # why do they make me do these things... sfall map can sometimes be too small #dd if=/dev/zero bs=1024 count=10000 >>& ${tempfile}moved.map # sfall xyzin ${tempfile}mapme.pdb HKLOUT ${tempfile}moved.mtz << EOF-sfall > $logfile # MODE SFCALC XYZIN # CELL $CELL # SYMM $CCSG #EOF-sfall # fft hklin ${tempfile}moved.mtz mapout ${tempfile}moved.map << EOF-fft > $logfile # LABIN F1=FC PHI=PHIC # GRID $GRID #EOF-fft rm -f ${tempfile}mapme.pdb >& /dev/null # compare to the "right" map echo "correlate residue" |\ overlapmap mapin1 ${tempfile}right.map \ mapin2 ${tempfile}moved.map \ mapin3 ${tempfile}label.map mapout /dev/null |\ nawk '/Main-corr-coef/,/Total/{print}' | tee ${tempfile}correlation |\ nawk '$1+0>0{print}' |\ nawk '$2+0>0{print $1,$2;next} $3+0>0{print $1,$3}' |\ nawk '{print "ATOMCC",$0}' >! ${tempfile}atom_corr.txt # format: ATOMCC rightatomnum CC # encode the atomic CCs as an occupancy in a new PDB file cat ${tempfile}atomcodes.txt ${tempfile}atom_corr.txt $right_pdb |\ nawk '/^code /{name[$2+0]=substr($0,11,26);next} /^ATOMCC /{occ[$2]=$3;next} ! /^ATOM/ {print}\ /^ATOM/ || /^HETATM/{++n;\ if(name[n]=="" || occ[n]+0<0.01){name[n]=substr($0,1,26);occ[x]=substr($0,55,6)+0};\ printf "%26s%28s%6.2f%s\n", name[n],substr($0,27,28),occ[n],substr($0,61)}' |\ cat >! newlabel_$outfile set score = `nawk '/Total/{print $NF;print $(NF-1)}' ${tempfile}correlation | sort -nr | head -1` rm -f ${tempfile}right.map >& /dev/null rm -f ${tempfile}moved.map >& /dev/null rm -f ${tempfile}label.map >& /dev/null rm -f ${tempfile}correlation >& /dev/null rm -f ${tempfile}atom_corr.txt >& /dev/null rm -f ${tempfile}atomcodes.txt >& /dev/null rm -f ${tempfile}wrong_atomcodes.txt >& /dev/null else # the overall RMSD is the score set score = `nawk -f ${tempfile}rmsd.awk $right_pdb $outfile | nawk '/RMSD\(all/{print $2} /no atom pairs/{print "no identities"}'` endif echo "Overall ${SCORE}: $score" #clean up if($?debug) exit rm -f ${tempfile}rmsd.awk >& /dev/null rm -f ${tempfile}out.pdb >& /dev/null rm -f ${tempfile}moved.pdb >& /dev/null rm -f ${tempfile}best_scores >& /dev/null rm -f ${tempfile}scores >& /dev/null rm -f ${tempfile}.log >& /dev/null rm -f ${tempfile}.xyz >& /dev/null rm -f ${tempfile}COM.pdb >& /dev/null rm -f ${tempfile}symop_center >& /dev/null rm -f ${tempfile}_wrong???.pdb >& /dev/null rm -f ${tempfile}_right???.pdb >& /dev/null exit # Notes on hand flipping cat rh.pdb |\ awk -v SG=$SG '/^CRYST1/{a=$2;c=$4;\ if(SG=="F4132"){dorx=dory=dorz=a*0.25};\ if(SG=="I41"){dorx=a*0.5};\ if(SG=="I4122"){dorx=a*0.5;dorz=c*0.25};\ } ! /^ATOM|^HETAT/{print} /^ATOM|^HETAT/{;\ x=substr($0,31,8)+0;y=substr($0,39,8)+0;z=substr($0,47,8)+0;\ printf("%s%8.3f%8.3f%8.3f%s\n", substr($0,1,30),dorx-x,dory-y,dorz-z,substr($0,55))}' |\ cat >! lh.pdb ################################################################################ Setup: set good_origin = "" set good_right = "" set good_wrong = "" foreach arg ( $* ) if(("$arg" =~ *.pdb)||("$arg" =~ *.brk)) then # warn about probable mispellings if(! -e "$arg") then echo "WARNING: $arg does not exist" continue endif # make sure its really a pdb file egrep -l "^ATOM |HETATM " "$arg" >& /dev/null if($status) then echo "WARNING: $arg contains no atoms! " continue endif if(-e "$right_pdb") then set wrong_pdb = "$arg" else set right_pdb = "$arg" endif continue endif # space group if("$arg" =~ [PpCcIiFfRrHh][1-6]*) then set temp = `echo $arg | nawk '{print toupper($1)}'` if($?CLIBD) then set temp = `nawk -v SG=$temp '$4 == SG {print $4}' $CLIBD/symop.lib | head -1` endif if("$temp" != "") then # add this SG to the space group list set SG = "$temp" endif endif # flags and options if("$arg" == "correlate") then # correlation instead of RMSD set CORRELATE endif if("$arg" == "nochains" || "$arg" == "byfile") then # whole file is one chain set BYFILE endif if("$arg" == "noorigins") then # just symmetry search set NO_ORIGINS endif if("$arg" == "fast") then # skip several steps set SPEEDUP endif end if((! -e "$right_pdb")||(! -e "$wrong_pdb")) then goto Help endif # deploy rmsd awk program cat << EOF-script >! ${tempfile}rmsd.awk #! $nawk -f # # Calculate RMSD of atoms with the same name in two PDB files # # The PDB feild: # |<--- here -->| #ATOM 1 N ALA A 327 40.574 34.523 43.012 1.00 34.04 # # is used to determine if two atoms are a "pair" # BEGIN { if(! atom) atom = "CA" maxXYZ = maxdB = 0 max_atom_XYZ = max_atom_dB = 0 maxXYZ_ID = maxdB_ID = "-" max_atom_XYZ_ID = max_atom_dB_ID = "-" } /^ATOM/{ # read in values (watching for duplications) ID = substr(\$0,12,15) ++count[ID] if(count[ID] == 1) { # not initialized yet X[ID] = substr(\$0, 31, 8)+0 Y[ID] = substr(\$0, 39, 8)+0 Z[ID] = substr(\$0, 47, 8)+0 B[ID] = substr(\$0, 61, 6)+0 } if(count[ID] == 2) { ++pairs # seen this before, subtract values dX = X[ID] - substr(\$0, 31, 8) dY = Y[ID] - substr(\$0, 39, 8) dZ = Z[ID] - substr(\$0, 47, 8) dB[ID] = B[ID] - substr(\$0, 61, 6) # get drift (and add up squares of drifts) sqrD = dX*dX + dY*dY + dZ*dZ dXYZ[ID] = sqrt(sqrD) # remember maximum shifts if(dXYZ[ID] > maxXYZ) {maxXYZ = dXYZ[ID]; maxXYZ_ID = ID } if(dB[ID]*dB[ID] > maxdB*maxdB) {maxdB = dB[ID]; maxdB_ID = ID } # maintain mean-square sums sumXYZ += sqrD sumB += dB[ID]*dB[ID] # separate stats for special atom type if(ID ~ atom) { ++atom_pairs # maintain separate mean-square sums sum_atom_XYZ += sqrD sum_atom_B += dB[ID]*dB[ID] # remember maximum drifts too if(dXYZ[ID] > max_atom_XYZ) {max_atom_XYZ = dXYZ[ID]; max_atom_XYZ_ID = ID } if(dB[ID]*dB[ID] > max_atom_dB*max_atom_dB) {max_atom_dB = dB[ID]; max_atom_dB_ID = ID } } # debug output if(debug) { printf("%s moved %8.4f (XYZ) %6.2f (B)\\n", ID, dXYZ[ID], dB[ID]) } } if(count[ID] > 2) { print "WARNING: " ID " appeared more than twice! " } } END{ if((pairs+0 == 0)&&(! xlog)) { print "no atom pairs found" exit } rmsXYZ = sqrt(sumXYZ/pairs) rmsB = sqrt(sumB/pairs) if(atom_pairs+0 != 0) { rms_atom_XYZ = sqrt(sum_atom_XYZ/atom_pairs) rms_atom_B = sqrt(sum_atom_B/atom_pairs) } if(! xlog) { print pairs " atom pairs found" print "RMSD("atom" )= " rms_atom_XYZ " ("atom_pairs, atom " pairs)" print "RMSD(all)= " rmsXYZ " ("pairs" atom pairs)" print "RMSD(Bfac)= " rmsB print "MAXD(all)= " maxXYZ "\\tfor " maxXYZ_ID print "MAXD(Bfac)= " maxdB "\\tfor " maxdB_ID # final check for orphan atoms for(id in count) { if(count[id]<2) print "WARNING: " id " only found once" } } else { printf "%10.8f %10.8f %10.5f %10.8f %8.2f \\n", rms_atom_XYZ, rmsXYZ, rmsB, maxXYZ, maxdB } } EOF-script chmod a+x ${tempfile}rmsd.awk goto Return_from_Setup # TABLE OF ALLOWED ORIGIN SHIFTS These origin shifts were determined emprirically using 100 randomly-placed atoms that were shifted around with pdbset and checked with SFALL for identical amplitudes to the 0 0 0 origin. They should be correct for the CCP4 convention of symmetry. (I.E. R3 and R32 have a=b=c, but H3 and H32 have gamma=120) Note that 0.25 0.25 0.25 is not an allowed origin shift for F4132, despite that it may be convenient to check this when inverting the hand. P1 x y z P2 P21 C2 0 y 0 0 y 1/2 1/2 y 0 1/2 y 1/2 P222 P2221 P21212 P212121 C2221 C222 I222 I212121 F432 F4132 0 0 0 0 0 1/2 0 1/2 0 0 1/2 1/2 1/2 0 0 1/2 0 1/2 1/2 1/2 0 1/2 1/2 1/2 F222 F23 0 0 0 0 0 1/2 0 1/2 0 0 1/2 1/2 1/2 0 0 1/2 0 1/2 1/2 1/2 0 1/2 1/2 1/2 1/4 1/4 1/4 1/4 1/4 3/4 1/4 3/4 1/4 1/4 3/4 3/4 3/4 1/4 1/4 3/4 1/4 3/4 3/4 3/4 1/4 3/4 3/4 3/4 P4 P41 P42 P43 I4 I41 0 0 z 1/2 1/2 z P422 P4212 P4122 P41212 P4222 P42212 P4322 P43212 I422 I4122 0 0 0 0 0 1/2 1/2 1/2 0 1/2 1/2 1/2 P3 P31 P32 0 0 z 1/3 2/3 z 2/3 1/3 z R3 x=y=z H3 0 0 z 1/3 2/3 z 2/3 1/3 z P312 P3112 P3212 0 0 0 0 0 1/2 1/3 2/3 0 2/3 1/3 0 1/3 2/3 1/2 2/3 1/3 1/2 P321 P3121 P3221 P622 P6122 P6522 P6222 P6422 P6322 0 0 0 0 0 1/2 R32 0 0 0 1/2 1/2 1/2 H32 0 0 0 0 0 1/2 1/3 2/3 2/3 2/3 1/3 1/3 1/3 2/3 1/6 2/3 1/3 5/6 P6 P61 P65 P62 P64 P63 0 0 z P23 P213 P432 P4232 I432 P4332 P4132 I4132 0 0 0 1/2 1/2 1/2 END OF THE TABLE