First, run clstr2txt.pl
(it ships with CD-HIT). For example:
clstr2txt.pl BindingDBTargetSequences_100.fa.clstr > BindingDBTargetSequences_100_clusters.txt
Then run the Python code below, changing the input and output file names to match your own files:
import pandas as pd
from pandas import read_csv
# Load the tab-separated cluster table (the clstr2txt.pl output).
clusters = read_csv('goa_swissprot_2020_100.fasta.clstr.txt', sep="\t")

# Split into singleton clusters and clusters with several members.
# .copy() gives each subset its own data, so the column assignments made
# below (and later on multi_clusters) do not write into a slice of
# `clusters` and trigger pandas' SettingWithCopyWarning.
single_clusters = clusters[clusters['clstr_size'] == 1].copy()
multi_clusters = clusters[clusters['clstr_size'] != 1].copy()

# A singleton cluster is its own representative.
single_clusters['representitive_id'] = single_clusters['id']
def assign_cluster_representitive(cluster):
    """Record the representative id for every member of one cluster.

    Writes into the module-level ``multi_clusters`` frame: the
    ``representitive_id`` column is set, at the index labels of
    ``cluster``, to the ``id`` of the first row flagged ``clstr_rep == 1``.

    Args:
        cluster: rows belonging to a single cluster; must provide the
            ``clstr_rep`` and ``id`` columns and share index labels with
            ``multi_clusters``.

    Raises:
        IndexError: if no row in ``cluster`` has ``clstr_rep == 1``.
    """
    representative_ids = cluster.loc[cluster['clstr_rep'] == 1, 'id']
    if representative_ids.empty:
        # Same exception type as indexing an empty array, but with a
        # message that points at the actual data problem.
        raise IndexError("cluster has no row with clstr_rep == 1")
    multi_clusters.loc[cluster.index, 'representitive_id'] = representative_ids.iloc[0]
# Fill in the representative id for each multi-member cluster, one
# `clstr` group at a time. The group key itself is not needed.
for _, cluster in multi_clusters.groupby('clstr'):
    assign_cluster_representitive(cluster)

# Recombine both subsets and write the annotated table.
clusters = pd.concat([single_clusters, multi_clusters])
# NOTE: to_csv uses a comma separator by default, so the output file is
# comma-separated even though the input was read as tab-separated.
clusters.to_csv('goa_swissprot_2020_100_with_representatives.clstr.txt', index=False)