#! /bin/tcsh -f
#
#   retrieve CIF data from the PDB, convert it to MTZ format
#
#   usage:  <this script> pdbid [pdbid ...]
#           (the notes at the bottom of the file invoke it as getcif.com)
#
#   For every ID on the command line this script
#     1. skips the entry when a non-trivial <id>.mtz is already present,
#     2. obtains <id>.pdb (getpdb.com, then wget as a fallback) and <id>.cif
#        (local cache directory, then wget),
#     3. runs cif2mtz with the cell and space group taken from the CRYST1
#        record, repairing the cif file and retrying when cif2mtz fails,
#     4. post-processes the mtz file: cad, optional truncate of intensities,
#        a placeholder sigma column, and a free-R flag when none is found.
#
#   variables that are consulted with the set-test operator:
#     NOCACHE      when set, the copies from /data3/jamesh/all_sf are skipped
#     NO_TRUNCATE  when set, intensities are not converted to FP/SIGFP
#     NO_FREE      when set, no FreeR_flag is added
#     TRIED_CLEAN  when set in the environment, the clean-up repair is skipped
#   variables that must already be defined:
#     CLIBD        directory holding symop.lib
#     CLIB         directory holding cif_mmdic.lib
#
#   exit status: 9 when at least one entry failed, otherwise that of a bare exit
#

foreach pdbID ( $* )

    # per-entry state: without these resets a later entry would inherit the
    # space group and the clean-up flag of an earlier one
    set SG = ""
    unset TRIED_CLEAN

    set pdbid = `echo $pdbID | awk '{print tolower($1)}'`
    set PDBID = `echo $pdbID | awk '{print toupper($1)}'`

    # an existing mtz file larger than 1000 bytes counts as finished
    if(-s ${pdbid}.mtz) then
        set test = `ls -l ${pdbid}.mtz | awk '{print ($5>1000)}'`
        if($test) then
            echo "already got ${pdbid}.mtz"
            # move on to the next ID instead of ending the whole run
            continue
        endif
    endif

    # coordinates: getpdb.com first, direct download when that gave nothing
    set pdbin = ${pdbid}.pdb
    if(! -e "$pdbin") then
        getpdb.com $pdbid >! $pdbin
    endif
    if(! -s "$pdbin") then
        set ID = `echo $pdbid | awk '{print toupper($1)}'`
        wget -O test.gz http://www.rcsb.org/pdb/files/${ID}.pdb.gz > /dev/null
        gunzip test.gz
        mv test $pdbin
    endif

    # structure factors: start again from a saved original, then the cache
    if(-e ${pdbid}_orig.cif) cp -p ${pdbid}_orig.cif ${pdbid}.cif
    if(! $?NOCACHE && -s /data3/jamesh/all_sf/${pdbid}.cif) then
        cp -p /data3/jamesh/all_sf/${pdbid}.cif ${pdbid}.cif
    endif
    if(! $?NOCACHE && -s /data3/jamesh/all_sf/${pdbid}_orig.cif) then
        cp -p /data3/jamesh/all_sf/${pdbid}_orig.cif ${pdbid}_orig.cif
    endif
    if(! -s ${pdbid}.cif) then
        # check to see if there IS experimental data?
        wget -O ${pdbid}.cif 'http://www.rcsb.org/pdb/download/downloadFile.do?fileFormat=STRUCTFACT&compression=NO&structureId='$PDBID > /dev/null
        if(! -s ${pdbid}.cif) then
            set BAD = "no cif available."
            goto exit
        endif
    endif

    # unit cell: fields 2-7 of the first CRYST1 record
    set CELL = `awk '/^CRYST1/{print $2,$3,$4,$5,$6,$7;exit}' ${pdbid}.pdb`
    # the space-group tests below index element 6 of CELL; a short list would
    # stop the shell with a subscript error, so report the entry as bad instead
    if($#CELL < 6) then
        set BAD = "no CRYST1 cell in $pdbin"
        goto exit
    endif

    if("$SG" == "") then
        # space-group text is cut from columns 56-69 of the CRYST1 record
        set pdbSG = `awk '/^CRYST1/{SG=substr($0,56,14);if(length(SG)==14)while(gsub(/[^ ]$/,"",SG));print SG;exit}' ${pdbid}.pdb | head -1`
        # rewrite spellings that the symop.lib lookup below would not match
        if("$pdbSG" == "R 32") set pdbSG = "R 3 2"
#        if("$pdbSG" == "P 21") set pdbSG = "P 1 21 1"
#        if("$pdbSG" == "A 2") set pdbSG = "A 1 2 1"
        if("$pdbSG" == "I 21") set pdbSG = "I 1 21 1"
        if("$pdbSG" == "A 1") set pdbSG = "P 1"
        if("$pdbSG" == "P 21 21 2 A") set pdbSG = "P 21 21 2"
        if("$pdbSG" == "P 1-") set pdbSG = "P -1"
        # a gamma of 120.00 selects the H setting
        if("$pdbSG" == "R 3 2" && $CELL[6] == 120.00) set pdbSG = "H 3 2"
        if("$pdbSG" == "R 3" && $CELL[6] == 120.00) set pdbSG = "H 3"
        # match against the 2nd or 4th apostrophe-delimited field of symop.lib
        # and keep word 4 of the matching line
        set SG = `awk -v pdbSG="$pdbSG" -F "[\047]" 'pdbSG==$2 || pdbSG==$4{print;exit}' ${CLIBD}/symop.lib | awk '{print $4}'`
    endif
    # nothing found in symop.lib: hand the raw text to cif2mtz
    if("$SG" == "") set SG = "$pdbSG"

    # does not seem to matter
    #set STATUS = "STATUS XPLOR"
    #set STATUS = "STATUS CCP4"

    set retries = 0
    # each repair further down modifies the cif file and jumps back here
again:
    @ retries = ( $retries + 1 )

    egrep "^CRYST1" ${pdbid}.pdb
    echo "CELL $CELL"
    echo "SYMM $SG"
    cif2mtz hklin ${pdbid}.cif hklout ${pdbid}.mtz << EOF >&! ${pdbid}_cif2mtz.log
CELL $CELL
SYMM "$SG"
EOF
    if($status) then
        # cif2mtz failed: give up once the retry counter has passed 10
        if($retries > 10) then
            set BAD = "too many retries "
            goto exit
        endif
        # keep the first log and the unmodified cif file for later inspection
        if(! -e ${pdbid}_orig_cif2mtz.log) cp ${pdbid}_cif2mtz.log ${pdbid}_orig_cif2mtz.log
        if(! -e ${pdbid}_orig.cif) cp ${pdbid}.cif ${pdbid}_orig.cif

        # repair 1: the log mentions _diffrn.detail without the trailing s
        set test = `awk '/_diffrn.detail[^s]/' ${pdbid}_cif2mtz.log | wc -l`
        if("$test" != 0) then
            echo "renaming detail to details in cif file..."
            awk '{gsub(/_diffrn.detail[^s]/,"_diffrn.details");print}' ${pdbid}_orig.cif >! new.cif
            mv new.cif ${pdbid}.cif
            goto again
        endif

        # repair 2 (once per entry): keep only the largest _refln loop.
        # loops.txt gets one line per loop_ block: its starting line number,
        # the number of _refln. labels and the number of lines counted as
        # data, sorted so that the block with the most data lines comes first
        if(! $?TRIED_CLEAN) then
            set TRIED_CLEAN
            echo "attempting to clean up cif file..."
            cat ${pdbid}.cif |\
            awk '{gsub("[^[:print:]]",""); print}' |\
            awk '/^loop_/{loop=NR;next} \
                loop && /^_refln\./{++r;next} \
                loop && ! (NF==r || NF==0 || /^#/){print loop,r,h;loop=r=h=0} \
                NF==r || NF==0 || /^#/{++h} END{print loop,r,h}' |\
            sort -k3gr >! loops.txt
            set bigloop = `head -1 loops.txt | awk '{print $1}'`
            set columns = `head -1 loops.txt | awk '{print $2}'`
            set lines = `head -1 loops.txt | awk '{print $2+$3+2}'`
            echo "refln loop at line $bigloop has $lines lines"
            # new file: first data line, then that loop reduced to its header,
            # its labels and the rows with the expected number of fields
            egrep "^data" ${pdbid}.cif | head -1 >! new.cif
            tail -n +$bigloop ${pdbid}.cif |\
            head -n $lines |\
            awk -v columns=$columns '/^loop/ || /_refln/ || NF==columns' |\
            cat >> new.cif
            mv new.cif ${pdbid}.cif
            goto again
        endif

        # repair 3: a label that the dictionary does not know
        set test = `awk '/Unexpected context type for category /' ${pdbid}_cif2mtz.log | wc -l`
        if("$test" != 0) then
            set badlabel = `awk -F "'" '/not defined in dictionary/{print $4}' ${pdbid}_cif2mtz.log | head -1`
            set badstring = `echo $badlabel | awk -F "." '{print $NF}'`
            awk -F "." '/^_refl/{print $2,"USED"}' ${pdbid}.cif >! used_labels.txt
            strings ${CLIB}/cif_mmdic.lib | awk -F "." '/^_refln\./{print $NF}' >! okay_labels.txt
            # matches.txt: for every pair of a label used in the file and a
            # dictionary label that share underscore-separated words, the
            # number of shared words, the dictionary label length and both names
            cat used_labels.txt okay_labels.txt |\
            awk '$2=="USED"{++used[$1];next} {n=split($1,w,"_");for(l in used){\
                m=split(l,x,"_");matches=0;for(i in w)for(j in x){if(w[i]==x[j])++matches};\
                if(matches) print matches,length($1),l,$1}}' |\
            sort -k1g -k2gr >! matches.txt
            # use most obscure label possible
            set betterstring = `grep "$badstring" matches.txt | head -1 | awk '{print $NF}'`
            # NOTE(review): the next line discards the value computed just
            # above and always substitutes A_calc - confirm this is intended
            set betterstring = A_calc
            if("$badstring" != "" && "$betterstring" != "") then
                echo "renaming $badstring to $betterstring in cif file..."
                awk -v badstring=$badstring -v betterstring=$betterstring '{\
                    gsub(badstring,betterstring);print}' ${pdbid}.cif >! new.cif
                mv new.cif ${pdbid}.cif
                goto again
            endif
        endif

        # repair 4: drop the lines that the log reports as syntax errors.
        # The list of line numbers is fed in as a first line ahead of the cif
        # text, hence the NR-1 when looking up each record
        set test = `awk '/Syntax error at line /' ${pdbid}_cif2mtz.log | wc -l`
        if("$test" != 0) then
            set badlines = `awk '/Syntax error at line /{print $5+0}' ${pdbid}_cif2mtz.log`
            echo "removing lines $badlines from cif file..."
            echo "$badlines" | cat - ${pdbid}.cif |\
            awk 'NR==1{for(i=1;i<=NF;++i)++bad[$i];next} ! bad[NR-1]{print}' |\
            cat >! new.cif
            mv new.cif ${pdbid}.cif
            goto again
        endif

        set BAD = "unknown problem"
        goto exit
    endif

    # sanitize with cad
    echo labin file 1 all | cad hklin1 ${pdbid}.mtz hklout sane.mtz > /dev/null
    mv sane.mtz ${pdbid}.mtz

    # check for structure factors?
    # The column table of the mtzdmp output is reduced to one line per data
    # column that has a companion column (type Q, L or M, or a name starting
    # with SIG) matched either by name or by being the next table row.
    # mtzdmp.log and columns.txt are kept for the steps further down.
    mtzdmp ${pdbid}.mtz | tee mtzdmp.log |\
    awk '/OVERALL FILE STATISTICS/,/No. of reflections used/' |\
    awk 'NF>8 && $(NF-1) ~ /[DFGHIJLPQW]/' |\
    tee columns.txt |\
    awk '{++n}\
        $(NF-1) ~ /^[FJGK]$/ || ($(NF-1) == "R" && $NF ~ /^[IF]/){\
            ++ds;line[ds]=n;t[ds]=$(NF-1);I[ds]=$NF; meanI[ds]=$(NF-4)} \
        $(NF-1) ~ /^[QLM]$/ || $NF ~ /^SIG/{\
            S=$NF; for(ds in I){\
                if(S=="SIG"I[ds] || S=="SIG"substr(I[ds],2) || n==line[ds]+1){\
                    SNR=0;sig=$(NF-4)+0; if(sig) SNR = meanI[ds]/sig;\
                    reso=$(NF-2);comp=substr($0,32)+0\
                    print I[ds], S, t[ds],reso, comp, SNR;}}}' |\
    sort -k3n,4 -k4nr,5 -k5nr |\
    awk -v ID=$pdbid '{++seen[$1]} /[\(\\+\-]/ && seen[$1]{next} {print ID,$0}' |\
    tee datasets.txt
    # format: I SIGI ctyp reso completeness signal/noise

    # no paired columns at all: list the data columns alone, with x written
    # in place of the sigma name and of the signal/noise value
    set test = `cat datasets.txt | wc -l`
    if("$test" == "0") then
        cat columns.txt |\
        awk '{++n}\
            $(NF-1) ~ /^[FJGK]$/ || ($(NF-1) == "R" && $NF ~ /^[IF]/){\
                print $NF, "x", $(NF-1),$(NF-2), substr($0,32)+0, "x";}' |\
        sort -k3n,4 -k4nr,5 -k5nr |\
        awk -v ID=$pdbid '{++seen[$1]} /[\(\\+\-]/ && seen[$1]{next} {print ID,$0}' |\
        tee datasets.txt
    endif

    # candidate free-flag columns: type I, or a name containing free
    cat mtzdmp.log |\
    awk '/OVERALL FILE STATISTICS/,/No. of reflections used/' |\
    awk 'NF>10 && $(NF-1) ~ /[I]/' |\
    awk -v ID=$pdbid '{++n}\
        $(NF-1) ~ /^[I]$/ || tolower($NF) ~ /free/{\
            mean=$(NF-4);reso=$(NF-2);comp=substr($0,32)+0\
            print ID, $NF ,reso, comp, mean;}' |\
    tee freestuff.txt

    # no column of type F: derive FP/SIGFP from the type J column pair
    set F = `awk '$4=="F"{print $2,$3}' datasets.txt`
    if("$F" == "" && ! $?NO_TRUNCATE) then
        set I = `awk '$4=="J"{print $2,$3}' datasets.txt`
        # the here-document below indexes I[1] and I[2]; with fewer than two
        # words the shell would stop with a subscript error
        if($#I >= 2) then
            echo "truncating $I to FP"
            truncate hklin ${pdbid}.mtz hklout truncated.mtz << EOF >&! ${pdbid}_truncate.log
truncate yes
labin IMEAN=$I[1] SIGIMEAN=$I[2]
labout F=FP SIGF=SIGFP
EOF
            # merge the new FP/SIGFP columns into the original file
            cad hklin1 ${pdbid}.mtz hklin2 truncated.mtz hklout new.mtz << EOF >> /dev/null
labin file 1 all
labin file 2 E1=FP E2=SIGFP
EOF
            mv new.mtz ${pdbid}.mtz
            rm -f truncated.mtz
        else
            echo "WARNING: no F or J column in ${pdbid}.mtz, nothing to truncate"
        endif
    endif

    # an x in the sigma field marks an F column listed without a companion:
    # create a type Q column with the constant value 0.1
    set sigF = `awk '$4=="F"{print $3}' datasets.txt`
    if("$sigF" == "x") then
        echo "no sigma! adding one..."
        set F = `awk '$4=="F"{print $2}' datasets.txt`
        rm -f new.mtz
        sftools << EOF > /dev/null
read ${pdbid}.mtz
calc Q col SIG$F = 0.1
write new.mtz
y
exit
y
EOF
        if(-s new.mtz) then
            mv new.mtz ${pdbid}.mtz
        else
            echo "failed... "
        endif
    endif

    # a usable flag column is one whose last field in freestuff.txt lies
    # strictly between 0 and 1; otherwise uniqueify is run to add one
    set free = `awk '$NF+0<1 && $NF+0>0{print $2;exit}' freestuff.txt`
    if("$free" == "" && ! $?NO_FREE) then
        echo "adding FreeR_flag"
        uniqueify ${pdbid}.mtz >& /dev/null
        mv ${pdbid}-unique.mtz ${pdbid}.mtz
    endif

    rm -f mtzdmp.log columns.txt datasets.txt freestuff.txt

    # common end of an entry: failures arrive here through goto with BAD set
exit:
    if($?BAD) then
        echo "ERROR: $pdbid $BAD"
        # collect the messages so the summary after the loop can report them
        if(! $?allBAD) set allBAD
        set allBAD = "$allBAD $BAD"
        unset BAD
        continue
    endif

    echo "OK ${pdbid}"

end

if($?allBAD) then
    echo "ERROR: $allBAD"
    exit 9
endif

exit

#######################################################################################
#
#   notes and tests...
#
#   Everything below follows the bare exit above and is never executed as
#   part of a normal run; it is kept as a record of how the script was tested.
#

foreach oddball ( `awk '{print NR}' ~jamesh/pdb/snapshot/oddball_SGs.txt` )

    set pdbSG = `awk -v line=$oddball 'NR==line' ~jamesh/pdb/snapshot/oddball_SGs.txt`
    grep "$pdbSG" ~jamesh/pdb/snapshot/CRYST1.txt |\
    awk '{print $1, "SELECT"}' |\
    cat - ~jamesh/pdb/snapshot/all_exp_data.txt |\
    awk '{$1=tolower($1)} /SELECT/{++selected[$1];next} selected[$1]{print}' |\
    head -10 >! tempfile.txt
    set pdbids = `cat tempfile.txt`
    echo "$pdbSG --> $pdbids" | tee -a example_oddballs.txt
    if($#pdbids > 0) then
        rm -f ${pdbids[1]}.mtz
        getcif.com $pdbids[1]
    endif
end

cp ~jamesh/pdb/snapshot/all_exp_data.txt pdb_list.txt

rm -f getcif_all.log
foreach pdbid ( `awk '{print tolower($1)}' pdb_list.txt` )
    rm ${pdbid}.mtz
    rm -f ${pdbid}.cif
    rm -f ${pdbid}_orig_cif2mtz.log
    echo "TRYING $pdbid" | tee -a getcif_all.log
    getcif.com $pdbid | tee -a getcif_all.log
end

ls -1rt *_orig.cif | awk -F "_" '{print $1}' | tee baddies.txt