
    sg%H                        d Z ddlZddlZddlmZ ddlmZ ddlZddlZddl	m
Z ddlmZ ddlmZmZ ddlmZmZmZmZmZmZmZ dd	lmZ dd
lmZ ddlmZ  ej>                           ej@                  e!      Z"g dZ#ejH                  jJ                  ejH                  jL                  ejH                  jN                  ejH                  jP                  ejH                  jR                  ejH                  jT                  ejH                  jV                  ejH                  jX                  ejH                  jZ                  ejH                  j\                  ejH                  j^                  ejH                  j`                  edZ1 e2d      Z3e3dgz   Z4e4g dz   Z5d Z6d Z7de8de8de9de8de8f
dZ:e!dk(  r ejv                         Z<e<j{                  de8dd       e<j{                  ddd !       e<j{                  d"de8dd#$       e<j{                  d%e8d&'       e<j{                  d(e8d)'       e<j}                         Z? e:e?j                  e?j                  e?j                  e?j                  e?j                         yy)*zConvert ESM checkpoint.    N)Path)TemporaryDirectory)batch_encode_sequences)
esmfold_v1)	EsmConfigEsmFoldConfig)EsmForMaskedLMEsmForSequenceClassificationEsmIntermediateEsmLayer	EsmOutputEsmSelfAttentionEsmSelfOutput)EsmForProteinFolding)EsmTokenizer)logging))protein1\  MNGTEGPNFYVPFSNATGVVRSPFEYPQYYLAEPWQFSMLAAYMFLLIVLGFPINFLTLYVTVQHKKLRTPLNYILLNLAVADLFMVLGGFTSTLYTSLHGYFVFGPTGCNLEGFFATLGGEIALWSLVVLAIERYVVVCKPMSNFRFGENHAIMGVAFTWVMALACAAPPLAGWSRYIPEGLQCSCGIDYYTLKPEVNNESFVIYMFVVHFTIPMIIIFFCYGQLVFTVKEAAAQQQESATTQKAEKEVTRMVIIMVIAFLICWVPYASVAFYIFTHQGSNFGPIFMTIPAFFAKSAAIYNPVIYIMMNKQFRNCMLTTICCGKNPLGDDEASATVSKTETSQVAPA)protein2?MKTVRQERLKSIVRILERSKEPVSGAQLAEELSVSRQVIVQDIAYLRSLGYNIVATPRGYVLA)protein3zPMKTVRQERLKSI<mask>RILERSKEPVSGAQLAEELS<mask>SRQVIVQDIAYLRSLGYN<mask>VATPRGYVLAGG)protein4zNMKTVRQERLKSI<mask>RILERSKEPVSGAQLAEELS<mask>SRQVIVQDIAYLRSLGYN<mask>VATPRGYVLA)esm1b_t33_650M_UR50Sesm1v_t33_650M_UR90S_1esm1v_t33_650M_UR90S_2esm1v_t33_650M_UR90S_3esm1v_t33_650M_UR90S_4esm1v_t33_650M_UR90S_5esm2_t48_15B_UR50Desm2_t36_3B_UR50Desm2_t33_650M_UR50Desm2_t30_150M_UR50Desm2_t12_35M_UR50Desm2_t6_8M_UR50Dr   ARNDCQEGHILKMFPSTWYVX)z<pad>z<mask>z<cls>z<sep>z<eos>c                      t               5 } dj                  t              }t        |       dz  }|j	                  |       t        t        |            }d d d        d_        |S # 1 sw Y   xY w)N
	vocab.txt
vocab_filer   )r   joinrestypes_with_extrasr   
write_textr   strpad_token_id)tempdirvocabr+   hf_tokenizers       V/var/www/html/venv/lib/python3.12/site-packages/transformers/models/esm/convert_esm.pyget_esmfold_tokenizerr5   M   sl    		 @		./'][0
e$#s:?	@
 !"L@ @s   A
A&&A/c                     |j                  | j                               }|j                  rt        d|j                         |j                  rt        d|j                         y )NzMissing keys: zUnexpected keys: )load_state_dict
state_dictmissing_keys
ValueErrorunexpected_keys)original_module
our_modulestatuss      r4   transfer_and_check_weightsr?   W   se    ''(B(B(DEF>&*=*=)>?@@,V-C-C,DEFF     modelpytorch_dump_folder_pathclassification_headpush_to_repo
auth_tokenc           	      %   | j                  d      rt        |           }nt        |           \  }}|j                          | j                  d      r|j                  j                  }|j                  j
                  }|j                  j                  }	d|z  }
|j                  j                  }d}d}d}t               }|j                  j                         D ]%  \  }}t        ||      s|dk7  st        |||       ' |j                  j                  j                         D ]9  \  }}t        |j                  |      s|dk7  s#t        |j                  ||       ; |j                  j                  j                  j                         D ]G  \  }}t        |j                  j                  |      s't        |j                  j                  ||       I nt        |d      r|j                  j                  }|j                  j                   }|j                  j                  }	|j                  j"                  }
|j                  j                  }|j$                  rdnd}d	}d}d
}n=|j                  }|j
                  }|j                  }	d|z  }
|j                  }d}d}d}d
}|r|j                  j&                  }t)        j*                        }|j,                  }|j.                  }|r|j                  }n|}t1        d@i d|j2                  j4                  d|d|d|d|	d|
ddddddddd|d|d|d|d|d|d|}|r5|j6                  d   j8                  j:                  j<                  d    |_        tA        d!|       | j                  d      rtB        }n|rtD        }ntF        } ||      } | j                          |j2                  j:                  | j                  jH                  jJ                  _        |d	k(  r9|jL                  j:                  | j                  jH                  jN                  _        |j$                  rr|j$                  j:                  | j                  jH                  jP                  _        |j$                  jR                  | j                  jH                  jP                  _)        |jT                  j:                  | j                  jV                  jT                  _        |jT                  jR                  | j                  jV                  jT                  _)        tY        |jZ                        D ]a  }| j                  jV                  j\                  |   }|j                   |   }|j^                  j`                  }|jb                  jd                  j:                  jf                  j<                  |jb                  jh                  j:                  jf                  j<                  cxk(  rg|jb                  jj                  j:                  jf                  j<                  cxk(  r/tm        jn                  |jp                  |jp                  f      k(  sJ  J |jb                  jh                  j:                  |jr                  j:                  _3        |jb                  jh                  jR                  |jr                  jR                  _3        |jb                  jd                  j:                  |jt                  j:                  _3        |jb                  jd                  jR                  |jt                  jR                  _3        |jb                  jj                  j:                  |jv                  j:                  _3        |jb                  jj                  jR                  |jv                  jR                  _3        ty        |jb                  d"d
      9|jb                  jz                  j|                  |j~                  j|                  _3        |j                  j:                  |j^                  j                  _        |j                  jR                  |j^                  j                  _)        |j                  j:                  |j                  _        |j                  jR                  |j                  _)        |j^                  j                  }|j                  j:                  j<                  |jb                  j8                  j:                  j<                  k(  sJ |jb                  j8                  j:                  |j                  _        |jb                  j8                  jR                  |j                  _)        |j                  }|j                  j:                  j<                  |j                  j:                  j<                  k(  sJ |j                  j:                  |j                  _        |j                  jR                  |j                  _)        |j                  }|j                  j:                  j<                  |j                  j:                  j<                  k(  sJ |j                  j:                  |j                  _        |j                  jR                  |j                  _)        d |r,|j                  jf                  | j                  _3        |j                  jf                  | j                  _3        t        |j                  | j                         t        |j                  | j                         t        |j                  | j                         t        |j                  | j                         t        |j                  | j                         t        |j                  | j                         t        |j                  | j                         n6|r|j                  j6                  d   j                  j:                  | j                  j                  _        |j6                  d   j                  jR                  | j                  j                  _)        |j6                  d   j8                  j:                  | j                  j8                  _        |j6                  d   j8                  jR                  | j                  j8                  _)        n8|j                  j                  j:                  | j                  j                  _        |j                  j                  jR                  | j                  j                  _)        |j                  jP                  j:                  | j                  jP                  _        |j                  jP                  jR                  | j                  jP                  _)        |j                  j:                  | j                  j                  _        |j                  jR                  | j                  _)        t        |j                  | j                  j                         |r
t        d
d# }nt        }|rt               }  | |D !cg c]  }!|!d$   	 c}!d%dd&      }"t        |D !cg c]  }!|!d$   	 c}!      \  }#}$}%}%}%tm        j                  |"d'   |#k(        xr tm        j                  |"d(   |$k(        }&n|j                         }' |'|      \  }(})}*t               5 }+d)j                  |j*                        },t        |+      d*z  }-|-j                  |,       t        t        |-      +      } d
d
d
         |D !cg c]  }!|!d$   	 c}!d%d,      }"tm        j                  |"d'   |*k(        }&tA        d-|&rd.nd/       |&st        d0      tm        j                         5  |rk|j                         j                  |D !cg c]  }!|!d$   	 c}!      }. | j                         |"d'   j                         |"d(   j                         1      }/nf | d@i |"d2di}/|/d3   }/|r/ |j                  j6                  d   |j                  *            }.n$ ||"d'   t        tY        d4            5      }.|.d3   }.|r^tm        j                  tm        j                  |/d6   |.d6   z
              j                         }0tm        j                  |/d6   |.d6   d7      }&nQtm        j                  tm        j                  |/|.z
              j                         }0tm        j                  |/|.d7      }&tA        d8|0        tA        d9|&rd.nd/       |&st        d:      |s| j                  |"d'   |"d(         }/|j                  |"d'         }.tm        j                  tm        j                  |/|.z
              j                         }0tm        j                  |/|.d7      }&tA        d;       tA        d8|0        tA        d9|&rd.nd/       |&st        d:      t        j                  |      j                  dd<       tA        d=|        | j                  |       ~d
d
d
       tA        d>|        | j                  |       |r'| j                  ||?       | j                  ||?       y
y
c c}!w c c}!w # 1 sw Y   xxY wc c}!w c c}!w # 1 sw Y   sxY w)Az?
    Copy/paste/tweak esm's weights to our BERT structure.
    esmfold   FrotaryTtrunkstructure_moduleargsabsoluteN
vocab_sizemask_token_idhidden_sizenum_hidden_layersnum_attention_headsintermediate_sizemax_position_embeddingsi  layer_norm_epsgh㈵>attention_probs_dropout_probg        hidden_dropout_probr0   emb_layer_norm_beforetoken_dropoutposition_embedding_typeis_folding_modelesmfold_config
vocab_listmnlir   zOur ESM config:rot_emb      pt)return_tensorspaddingadd_special_tokens	input_idsattention_maskr(   r)   r*   )rc   rd   z1Do both models tokenizers output the same tokens?u   🔥u   💩zTokenization does not match!)rf   rg   output_hidden_stateslogitsi  )repr_layers	positions)atolzmax_absolute_diff = z'Do both models output the same tensors?zSomething went wRoNgzContact prediction testing:)parentsexist_okzSaving model to zSaving tokenizer to )repo_idtoken_token )o
startswithMODEL_MAPPINGevalesm	embed_dim
num_layersattention_headsrY   r   cfgitemshasattrsetattrrJ   rK   rL   layersffn_embed_dimrX   alphabettupleall_toksmask_idxpadding_idxr   embed_tokensnum_embeddingsclassification_headsout_projweightshape
num_labelsprintr   r
   r	   
embeddingsword_embeddingsembed_positionsposition_embeddings
layer_normbiasemb_layer_norm_afterencoderrangerQ   layer	attentionself	self_attnk_projdataq_projv_projtorchSizerP   querykeyvaluegetattrr_   inv_freqrotary_embeddingsself_attn_layer_norm	LayerNormfinal_layer_normoutputdenseintermediatefc1fc2esm_s_combine
af2_to_esmr?   	embedding	esm_s_mlpdistogram_headptm_headlm_head	lddt_head
classifierdecodercontact_headSAMPLE_DATAr5   esmfold_encode_sequencesallget_batch_converterr   r,   r   r.   r   r/   	Exceptionno_gradcudainferrA   extract_featureslistmaxabsitemallclosepredict_contactspathlibmkdirsave_pretrainedpush_to_hub)1rA   rB   rC   rD   rE   ru   r   rv   rw   rR   rS   rY   rX   rZ   r[   r\   r   valr]   rO   r0   original_esm_modelconfigmodel_classir   	esm_layerr   self_outputr   bert_outputsample_datar3   row	hf_tokensesmfold_aasesmfold_mask_successbatch_converterbatch_labels
batch_strsbatch_tokensr1   r2   r+   their_output
our_outputmax_absolute_diffs1                                                    r4   !convert_esm_checkpoint_to_pytorchr   _   sJ    	"E"$%e,.XHHJ	"GG%%	WW''
!gg55	M-- %"*& 	2HC~s+wS1	2 ++- 	8HC~++S1c=O6O,,c37	8 66<<> 	IHC~++<<cB,,==sCH	I 
f	HH&&	XX__
!hh66HH22..(+(A(Au",  MM	^^
!11	M)) %"* 77##x(()J%%M''L WW  %22AA#  %	
 0 , !%  &)   " 4 $ !8 *  &!" #F& 44V<EELLRRSTU	
V$	"*	2$E	JJL 3E2Q2Q2X2XEII((/*,:L:\:\:c:c		007##1C1Y1Y1`1`		''./A/W/W/\/\		'',4F4[4[4b4bEII**12D2Y2Y2^2^EII**/6++, 54))++11!4&--a0	 ',oo&:&:	&&--2288""))0055;;D""))0055;;D zz6--v/A/ABCD	
D	
D '0&9&9&@&@&G&G	#$-$7$7$>$>$C$C	!$-$7$7$>$>$E$E	!"+"5"5"<"<"A"A	&/&9&9&@&@&G&G	#$-$7$7$>$>$C$C	!9&&	48D 9B8K8K8S8S8\8\I''005 ,5+I+I+P+P!!()2)G)G)L)L!!&!*!;!;!B!B(99>> &+__%;%;  ''--1D1D1M1M1T1T1Z1ZZZZ#,#6#6#?#?#F#F !*!4!4!=!=!B!B ).(:(:!!((..)--2F2F2L2LLLL$-MM$8$8!"+--"4"4 "'  ''--1E1E1K1KKKK#,==#7#7 !*!3!3k54p #&#4#4#9#9  # 3 3"3==%//B"3==%//B"399ekk:"3#5#5u7K7KL"3<<@"3;;>"3==%//B	(+(D(DV(L(R(R(Y(Y%&)&>&>v&F&L&L&Q&Q#+.+C+CF+K+T+T+[+[!!(),)A)A&)I)R)R)W)W!!& &)[[%6%6%=%="#&;;#4#4#9#9 *-++*@*@*G*G  '(+(>(>(C(C  %'*{{'9'9$ [[-- s//1G1GH !"1o!,. *+SV+D$ch
	 .FYdFeRUs1vFe-f*\1a))Ik2kAB 
uyy&'<7H

 #6681@1M.j,! 	DWIIh//0Eg4J!!%('3z?CL		D !K!@S#a&!@QU_cd	))Ik2lBC	
=vV\]677	 3 88:++{,KSV,KLL%#K0557	RbHcHhHhHjJ FFFJ#H-J"Esyy==fEcFZFZ[gFhi"9[#9tERUJGWX+H5 %		%))J{4Kl[fNg4g*h i n n pnnZ%<l;>W^bcG %		%))J4M*N O T T VnnZDIG$%6$78977PVW233//	+0F	RbHcdJ//	+0FGL %		%))J4M*N O T T VnnZDIG/0():(;<=;wVTZ[ 677-.44TD4Q !9 :;<67g3j 
 !9 :
;<  !9:,JG  : N a ,Fe	D 	D "A -L3 3sK   $AJ8AJ=<AAKAK< AKAK
(I?AKKAKKAKKAK"__main__z--pytorch_dump_folder_pathTz!Path to the output PyTorch model.)typerequiredhelpz--classification_head
store_truez/Whether to convert a final classification head.)actionr   z--modelzName of model to convert.)defaultr   r   r   z--push_to_repoz(Repo to upload to (including username!).)r   r   z--auth_tokenzHuggingFace auth token.)E__doc__argparser   r   tempfiler   ru   
esm_moduler   esm.esmfold.v1.miscr   r   esm.esmfold.v1.pretrainedr   )transformers.models.esm.configuration_esmr   r   $transformers.models.esm.modeling_esmr	   r
   r   r   r   r   r   (transformers.models.esm.modeling_esmfoldr   (transformers.models.esm.tokenization_esmr   transformers.utilsr   set_verbosity_info
get_logger__name__loggerr   
pretrainedr   r   r   r   r   r   r   r    r!   r"   r#   r$   rs   r   restypesrestypes_with_xr-   r5   r?   r/   boolr   ArgumentParserparseradd_argument
parse_argsrL   rA   rB   rC   rD   rE   rq   r@   r4   <module>r      ss       '   R 0 N   J A &    			H	% '11FF(33JJ(33JJ(33JJ(33JJ(33JJ$//BB#..@@%00DD%00DD$//BB"-->>  &'cU"&)WW G^O^O*-^ODH^OX[^Oil^OB	 z$X$$&F
$3Dg   ;l   	4cDOjk
(s9cd
S7PQD%

D1143K3KTM^M^`d`o`o r@   