#! /bin/tcsh -f
#
#  Survey the structure-determination methods recorded in REMARK 200
#  of the PDB entries found under a local directory tree.
#
#  Reads:   ftp.wwpdb.org/pub/pdb/data/structures/divided/pdb/  (searched for *.ent.gz)
#  Writes:  REM200.txt        every REMARK 200 line, prefixed with the entry id
#           method.txt        "id : experiment type : method text", one line per
#                             STARTING MODEL record
#           stuff             scratch file: "id isapdb" lines, then the method texts
#           method_count.txt  "count label" pairs, sorted by count (also echoed to stdout)
#
#  NOTE: inside the quoted awk programs every line ends in a backslash so that
#  the shell keeps the newline, and the negation operator is always followed by
#  a blank or an equals sign.  Keep both conventions when editing.
#

# extract REMARK 200 from everything
# find prints each file name and zgrep then prints the REMARK 200 lines of that
# file.  awk takes the 4-character id out of the file name, tags each REMARK 200
# line with it, and reports an entry that produced no such line once the next
# file name shows up.  The END rule does the same for the last file, which has
# no following file name to trigger the report.
find ftp.wwpdb.org/pub/pdb/data/structures/divided/pdb/ -name '*.ent.gz' -print -exec zgrep "REMARK 200" \{\} \; |\
awk '/^ftp./{id=substr($0,length($0)-10,4)}\
/^REMARK 200 /{rem=1;print id,$0}\
id!=lastid && lastid!="" && rem==0{print lastid,"has no REM200"}\
id!=lastid {rem=0}\
{lastid=id}\
END{if(lastid!="" && rem==0) print lastid,"has no REM200"}' >! REM200.txt

# get all information that can be used to determine the method used
# The method text starts after the last colon of the METHOD USED line and is
# extended with the text from column 17 onward of each following line, until a
# SOFTWARE USED line, a line that is blank from column 17, or end of input.
# The getline result is tested so that a truncated file cannot spin forever.
# Runs of blanks are then squeezed to one blank and colons are blanked out,
# because the colon is the field separator of method.txt.
# A STARTING MODEL record that does not contain NULL marks the entry as
# molecular replacement; every STARTING MODEL record emits one output line.
# type is whatever followed the last colon of the latest EXPERIMENT TYPE line.
awk -F ":" '\
/METHOD USED TO DETERMINE THE STRUCTURE/{\
    id=substr($0,1,4);meth=$NF;\
    while((getline)>0 && substr($0,17,13)!="SOFTWARE USED" && substr($0,17)~/[^ ]/){meth=meth substr($0,17)}\
    gsub(/ +/," ",meth);\
    gsub(":"," ",meth);}\
/STARTING MODEL/ && ! /NULL/{meth="MOLECULAR REPLACEMENT " meth}\
/STARTING MODEL/{print id,":",type,":",meth}\
/EXPERIMENT TYPE/{type=$NF}' REM200.txt >! method.txt

# reformat it a bit
# First all entry ids, each tagged with the word isapdb, then all method texts.
# The ids come first so the classifier below has the complete id table in
# memory before it sees the first method text.
awk -F ":" '{print $1,"isapdb"}' method.txt >! stuff
awk -F ":" '{print $NF}' method.txt >> stuff

# try to figure out what it all means
# Lines ending in isapdb only fill the table of known ids (upper-cased).
# Every other line is a method text: the rules are tried from top to bottom
# and the first one that matches decides the label, so their ORDER MATTERS
# (specific spellings come before the bare MAD/SAD/MIR/SIR/MR substrings).
# A text whose last word is a known entry id counts as MR.  A text that matches
# nothing is tallied under its own full text so it shows up in the output for
# manual inspection.
awk '$NF=="isapdb"{++isapdb[toupper($1)];next}\
/MOLECULAR/ || /REPLACEMENT/ {++count["MR"];next}\
/STRUCTURE/ {++count["MR"];next}\
/NATIVE/ {++count["MR"];next}\
/ PDB / {++count["MR"];next}\
/ STARTING MODEL / {++count["MR"];next}\
/ WILD-TYPE / {++count["MR"];next}\
/COORDINATES/ {++count["MR"];next}\
/DIRECT REFINE/ {++count["MR"];next}\
/RIGID.BODY/ {++count["MR"];next}\
/ALREADY SOLVED/ {++count["MR"];next}\
/[- ]MAD[- ]/ {++count["MAD"];next}\
/[- ]RIP[- ]/ {++count["RIP"];next}\
/SEMAD/ {++count["MAD"];next}\
/HGMAD/ {++count["MAD"];next}\
/SINGLE/ && /ANOM[AO]LOUS/ {++count["SAD"];next}\
/ANOM[AO]LOUS/ {++count["MAD"];next}\
/[- ]SAD[- ]/ {++count["SAD"];next}\
/[- ]SAS[- ]/ {++count["SAD"];next}\
/[- ]SIR[- ]/ {++count["SIR"];next}\
/ SINGLE WAVELENGTH / {++count["SAD"];next}\
/ ISOMORPHOUS REPLACEMENT / {++count["MIR"];next}\
/ MIR / {++count["MIR"];next}\
/ M\.I\.R/ {++count["MIR"];next}\
/ MULTIPLE ISOMORPHOUS / {++count["MIR"];next}\
/ SINGLE ISOMORPHOUS / {++count["SIR"];next}\
/ SIRA[SD]/ {++count["SIRAS"];next}\
/ SIR[SO\/]A[SD]/ {++count["SIRAS"];next}\
/ SIR[\(]AS/ {++count["SIRAS"];next}\
/ MIRA[SD]/ {++count["MIRAS"];next}\
/ MIR[SO\/]A[SD]/ {++count["MIRAS"];next}\
/ MIR[\(]AS/ {++count["MIRAS"];next}\
/ HEAVY/ {++count["MIR"];next}\
/ SOAKING/ {++count["MIR"];next}\
/ SIMULA/{++count["MR"];next}\
/PDB CODE/{++count["MR"];next}\
/ WILD/ && /TYPE /{++count["MR"];next}\
/ SINGLE HG/ || / ISAS /{++count["SIR"];next}\
/AUTOSOL /{++count["MAD"];next}\
/ REFMAC/{++count["MR"];next}\
/ MOLREP/{++count["MR"];next}\
/ PHASER $/{++count["MR"];next}\
/ PHENIX /{++count["MR"];next}\
/ COMO /{++count["MR"];next}\
/ SOLVE $/ || /AUTOSHARP/{++count["MAD"];next}\
/ ARP\/WARP / {++count["MR"];next}\
/ RIPAS/ {++count["RIP"];next}\
/SAD/ {++count["SAD"];next}\
/MAD/ {++count["MAD"];next}\
/MIR/ {++count["MIR"];next}\
/SIR/ {++count["SIR"];next}\
/MR/ {++count["MR"];next}\
isapdb[$NF]{++count["MR"];next}\
/REFINEMENT/ {++count["MR"];next}\
/D-FOURI/ || /DIFFERENCE/ {++count["MR"];next}\
/DIRECT/ || /HALF-BAKE/ || /AB INITIO/ || /DUAL-SPACE/ {++count["AI"];next}\
/ WAVELENGTH / {++count["MAD"];next}\
/ ISMORPHOUS/ || /ISOMORPH/ && ! /REPLA/ || /ISO TO/ {++count["MR"];next}\
/ FOURIER SYNTHESIS / || / FOURIER METHODS / {++count["MR"];next}\
/ KNOWN / {++count["MR"];next}\
/ MANUALLY PLACED / {++count["MR"];next}\
{++count[$0]}\
END{\
for(meth in count) print count[meth],meth}' stuff |\
sort -g | tee method_count.txt