/* * gen_sequence.c program for generating random sequences of * amino acids that have lengths and compositions * similar to sequences in real protein databases. * Copyright (C) 2000 Kevin Karplus * * This library is free software; you can redistribute it and/or * modify it under the terms of the GNU Lesser General Public * License as published by the Free Software Foundation; * version 2.1 of the License. * * This library is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU * Lesser General Public License for more details. * * See http://www.gnu.org/copyleft/lesser.html for the license details, * or write to the Free Software Foundation, Inc., 59 Temple Place, Suite * 330, Boston, MA 02111-1307 USA */ /* gen_sequence.c * Kevin Karplus 7 Nov 2000 * * This program generates random amino acid sequences. * It takes one command-line argument, the number of sequences to * generate, and puts them out in FASTA format to standard out. * * The length of each sequence is taken from a discretized log-normal * distribution that was fit to the sequences in RSDB-60 [rsdb]. * * The amino acids of a sequence are generated by an independent, * identically distributed process. The probabilities for that * distribution are selected from a mixture of Dirichlet densities. * The mixture of Dirichlet densities was also trained on RSDB-60. * The particular mixture chosen here is not the best fit, but a * compromise between the number of components and the fit. * * A slightly more sophisticated generator (that accepts parameters for * different lognormal distributions and different Dirichlet mixtures * for the composition) will be included in a future release of the SAM * software suite. * * *[rsdb] Park J, Holm L, Heger A, Chothia C * RSDB: representative protein sequence databases have high * information content * Bioinformatics 2000 May;16(5):458-64 * ftp://ftp.ebi.ac.uk/pub/contrib/jong/RSDB/Ver_1999_8_30/ */ #include "gen_norm.h" #include "gen_dirch_mix.h" #include #include #include #include /* pick which underlying uniform random number generator to use. */ #define DRAND() ((random()+0.5)/ 2147483648.0) #define mean_log_length 5.4151 #define stddev_log_length 1.03632564 const char *AA= "ACDEFGHIKLMNPQRSTVWY"; double mix_coeff[6] = {0.272155, 0.259417, 0.170403, 0.139492, 0.0813267, 0.0772063}; double comp0[21]={608.507, 44.1267, 8.54778, 34.3287, 45.012, 25.4885, 40.1706, 13.8165, 37.9695, 40.2287, 57.0279, 15.7816, 24.6325, 26.6716, 21.0461, 33.2734, 39.015, 31.1269, 43.4722, 6.95188, 19.8187}; double comp1[21]={184.442, 12.9502, 3.56352, 9.54209, 10.7789, 7.22886, 11.7739, 4.45173, 9.25549, 10.113, 16.1048, 4.26184, 8.89566, 10.1378, 8.39929, 9.83732, 16.2176, 11.391, 11.4088, 2.43706, 5.69262}; double comp2[21]={368.48, 17.2777, 5.02666, 20.633, 27.1002, 18.6022, 16.5669, 7.46735, 29.459, 33.201, 36.5843, 8.33165, 23.2366, 12.2568, 12.8126, 15.8195, 26.7285, 18.5344, 20.7154, 3.5349, 14.591}; double comp3[21]={300.683, 36.82, 3.69397, 16.532, 17.1343, 9.09368, 25.9803, 7.44929, 11.6098, 7.16141, 31.5673, 6.2929, 6.50351, 17.932, 9.98346, 24.4647, 17.2535, 16.1794, 23.996, 4.379, 6.65627}; double comp4[21]={30.4362, 2.4706, 0.588751, 1.33572, 1.65825, 1.08549, 2.23149, 0.722458, 1.4641, 1.70528, 2.49194, 0.900071, 1.33377, 1.84355, 1.25755, 1.65407, 2.63801, 1.88732, 1.84009, 0.399507, 0.92818}; double comp5[21]={329.148, 25.0801, 4.5497, 8.56496, 10.2153, 24.5674, 21.9557, 5.42381, 29.6721, 13.1884, 43.4575, 10.6261, 11.4957, 13.0346, 8.10884, 11.8077, 25.3051, 18.1739, 25.4028, 5.88753, 12.6309}; double *comps[6] = {comp0, comp1, comp2, comp3, comp4, comp5}; int main(int argc, const char **argv) { int count=1; int length; /* length of seqence */ double probs[20]; /* probability mass function for AAs */ double cum_probs[20]; /* probability distribution * (cumulative sum of probs) */ int s; /* counter for sequences */ int p; /* counter for position in sequence */ int a; /* counter for amino acid number */ int alo, ahi, amid; /* variables for binary search to get amino acid */ double sum; /* temporary for computing cum_probs */ double x; /* uniform random number */ gen_dirch_mix_param comp_gen; /* composition generator */ if (argc!=2) { fprintf(stderr,"gen_sequence requires exactly one argument,\n"); fprintf(stderr,"the number of random sequences to generate.\n"); } else count=atoi(argv[1]); srandom(getpid()); gen_dirch_mix_initialize(&comp_gen, 20, 6, mix_coeff, (const double **)comps); for (s=0; srseq_%06d\n", s); length = (int) exp(mean_log_length +stddev_log_length*gen_norm()); gen_dirch_mix(&comp_gen, probs); sum = 0.; /* convert probs to cumulative probabilities */ for (a=0; a<20; a++) { sum += probs[a]; cum_probs[a] =sum; } for (p=0; p< length; p++) { x = DRAND(); /* do binary search to determine amino acid */ alo = -1; ahi=19; while (alo cum_probs[amid]) alo=amid; else ahi=amid; } putchar(AA[ahi]); if (p%50 == 49) putchar('\n'); } putchar('\n'); } return 0; }