Go to the documentation of this file.
10 #ifndef _PROTEINORTHOCLUSTERING
11 #define _PROTEINORTHOCLUSTERING
34 #define floatprecision_H 1
49 map<string,floattype> t_master;
53 void debug__print_edgelist (
protein& node,
const unsigned int index,
const int node_id);
56 void debug__conn_integrity(vector<unsigned int>& nodes,
floattype conn);
58 void debug__graph_integrity(vector<unsigned int>& nodes);
119 extern void ssyevr_(
char* jobz,
char* range,
char* uplo,
int* n,
float* a,
120 int* lda,
float* vl,
float* vu,
int* il,
int* iu,
float* abstol,
121 int* m,
float* w,
float* z,
int* ldz,
int* isuppz,
float* work,
122 int* lwork,
int* iwork,
int* liwork,
int* info );
123 extern void dsyevr_(
char* jobz,
char* range,
char* uplo,
int* n,
double* a,
124 int* lda,
double* vl,
double* vu,
int* il,
int* iu,
double* abstol,
125 int* m,
double* w,
double* z,
int* ldz,
int* isuppz,
double* work,
126 int* lwork,
int* iwork,
int* liwork,
int* info );
139 template<
class T>
void dssyevr_(
char* jobz,
char* range,
char* uplo,
int* n, T* a,
140 int* lda, T* vl, T* vu,
int* il,
int* iu, T* abstol,
141 int* m, T* w, T* z,
int* ldz,
int* isuppz, T* work,
142 int* lwork,
int* iwork,
int* liwork,
int* info ){
143 #if floatprecision_H == 1
144 ssyevr_(jobz, range, uplo, n, a,
145 lda, vl, vu, il, iu, abstol,
146 m, w, z, ldz, isuppz, work,
147 lwork, iwork, liwork, info );
148 #elif floatprecision_H == 2
149 dsyevr_(jobz, range, uplo, n, a,
150 lda, vl, vu, il, iu, abstol,
151 m, w, z, ldz, isuppz, work,
152 lwork, iwork, liwork, info );
154 cerr << string(
"Error: invalid floattype (should be either float=1 or double=2)!").c_str() << endl;
throw;
189 if(i > m_content_CC.size()){cerr <<
"[CRITICAL ERROR] out of bound in ConnectedComponent[]" << endl;
throw;}
190 return m_content_CC[i];
195 if(i > m_content_CC.size()){cerr <<
"[CRITICAL ERROR] out of bound in ConnectedComponent[]" << endl;
throw;}
196 return m_content_CC[i];
// Non-const overload: returns m_content_CC.size(), i.e. the number of
// entries currently stored in m_content_CC, converted to unsigned int.
201 unsigned int size(){
return m_content_CC.size();}
// Const overload: returns m_content_CC.size(), i.e. the number of
// entries currently stored in m_content_CC, converted to unsigned int.
202 unsigned int size()
const {
return m_content_CC.size();}
214 m_content_CC.push_back(i);
246 inline bool operator() (
const pair<string,int> & p1,
const pair<string,int>& p2)
248 return (p1.second > p2.second);
269 size_t getPeakRSS( );
294 unsigned int BFS_not_critical( vector<unsigned int> * done_withBacktrack,
unsigned int start_node,
unsigned int end_node );
374 void tokenize(
const string& str, vector<string>& tokens,
const string& delimiters =
"\t");
381 unsigned int max_of_diag(vector<unsigned int>& nodes, vector<unsigned int>& diag);
389 vector<floattype>
get_new_x(vector<floattype> x, vector<unsigned int>& nodes, map<unsigned int,unsigned int> &mapping,
bool isWeighted);
401 vector<floattype>
getY(
floattype max_degree, vector<floattype> x_hat, vector<floattype> x_new, vector<unsigned int>& nodes, vector<unsigned int>& diag);
405 unsigned int sumOutDegs(
const vector<unsigned int>& nodes);
423 void splitGroups(vector<floattype>& y, vector<unsigned int>& nodes ,
bool useLapack);
void ssyevr_(char *jobz, char *range, char *uplo, int *n, float *a, int *lda, float *vl, float *vu, int *il, int *iu, float *abstol, int *m, float *w, float *z, int *ldz, int *isuppz, float *work, int *lwork, int *iwork, int *liwork, int *info)
bool criticalHeuristic(ConnectedComponent *cur_cc)
void push_back(unsigned int i)
Definition: proteinortho_clustering.h:213
unsigned short weight
the weight of this given edge
Definition: proteinortho_clustering.h:222
string full_name
the full input name of this protein
Definition: proteinortho_clustering.h:230
floattype getConnectivity(vector< unsigned int > &nodes, bool useLapack)
void splitGroups(vector< floattype > &y, vector< unsigned int > &nodes, bool useLapack)
ConnectedComponent()
Definition: proteinortho_clustering.h:181
unsigned int BFS_not_critical(vector< unsigned int > *done_withBacktrack, unsigned int start_node, unsigned int end_node)
void dsyevr_(char *jobz, char *range, char *uplo, int *n, double *a, int *lda, double *vl, double *vu, int *il, int *iu, double *abstol, int *m, double *w, double *z, int *ldz, int *isuppz, double *work, int *lwork, int *iwork, int *liwork, int *info)
double density
the graph density calculated in the function BFS (at the end)
Definition: proteinortho_clustering.h:177
const unsigned int & operator[](unsigned int i) const
Definition: proteinortho_clustering.h:194
vector< floattype > get_new_x(vector< floattype > x, vector< unsigned int > &nodes, map< unsigned int, unsigned int > &mapping, bool isWeighted)
bool param_verbose
By default, no verbose output is printed.
Definition: proteinortho_clustering.h:70
floattype param_sep_purity
as a reference: a-b-c will give +/-0.707107 and 2.34857e-08
Definition: proteinortho_clustering.h:73
vector< int > reorder_table
Tells how proteins/species must be sorted.
Definition: proteinortho_clustering.h:102
ConnectedComponent BFS(vector< bool > *done, unsigned int cur_node)
size_t getPeakRSS()
bool param_useLapack
If true, then LAPACK is used according to param_lapack_power_threshold_d.
Definition: proteinortho_clustering.h:81
Definition: proteinortho_clustering.h:235
unsigned int critical_min_nodes
Deprecated.
Definition: proteinortho_clustering.h:84
unsigned int numberOfNodesToMemoryUsageLaplacian_inKB(unsigned int n)
void removeExternalEdges(map< unsigned int, bool > &a)
vector< wedge > edges
all the adjacent nodes along with weights -> wedge
Definition: proteinortho_clustering.h:228
unsigned int num_cpus
By default only one core is used (change this with -cpus)
Definition: proteinortho_clustering.h:104
unsigned int param_max_nodes
= 2^24. The absolute maximal number of nodes for a connected component. If exceeded then it is skippe...
Definition: proteinortho_clustering.h:74
float floattype
Definition: proteinortho_clustering.h:33
floattype param_lapack_power_threshold_d
The minimum graph density for the power iteration method; LAPACK's (d|s)syevr is used otherwise....
Definition: proteinortho_clustering.h:92
const unsigned int maxUseWeightsNumNodes
= 2^20. The maximum number of nodes for a connected component such that weights are still used for ca...
Definition: proteinortho_clustering.h:91
unsigned int species_counter
Number of species (in the input graph)
Definition: proteinortho_clustering.h:95
bool param_useKmereHeuristic
If true, then the kmere-heuristic is used. For ambiguous splits (e.g. all but one entry of the fiedler vec...
Definition: proteinortho_clustering.h:79
Definition: proteinortho_clustering.h:244
void print_group(vector< unsigned int > &nodes, floattype connectivity)
void remove_edge_index(const unsigned int node_id, const unsigned int index)
unsigned int debug_level
Debug stderr level.
Definition: proteinortho_clustering.h:72
floattype param_min_species
Minimum number of species per connected component.
Definition: proteinortho_clustering.h:75
unsigned int param_max_iter
Maximal number of iterations for power iteration.
Definition: proteinortho_clustering.h:86
unsigned int edge
the adjacent node index of variable graph
Definition: proteinortho_clustering.h:221
vector< shared_ptr< ofstream > > graph_clean
File handler to store graph data.
Definition: proteinortho_clustering.h:101
unsigned int max_of_diag(vector< unsigned int > &nodes, vector< unsigned int > &diag)
vector< floattype > normalize(vector< floattype > x, floattype *length)
unsigned long graph_ram_total_inKB
The internal size of the input graph in KB (this will be set in parse_file())
Definition: proteinortho_clustering.h:103
vector< floattype > generate_random_vector(const unsigned int size)
void clear_edges(vector< unsigned int > &nodes)
string param_rmgraph
Name of the remove graph, containing all edges that are removed by this clustering algorithm.
Definition: proteinortho_clustering.h:76
const unsigned int kmereHeuristic_minNumberOfGroups
The minimum number of species for the kmere-heuristic.
Definition: proteinortho_clustering.h:90
floattype param_epsilon
Epsilon for the LAPACK functions dsyevr/ssyevr as well as the power iteration. Set analogously to http://peopl...
Definition: proteinortho_clustering.h:87
unsigned int edges
Total number of edges.
Definition: proteinortho_clustering.h:100
void tokenize(const string &str, vector< string > &tokens, const string &delimiters="\t")
bool comparator_pairfloattypeUInt(const pair< floattype, unsigned int > &l, const pair< floattype, unsigned int > &r)
bool test__generate_random_vector()
Definition: proteinortho_clustering.h:220
void stats(floattype i, floattype size)
unsigned int protein_counter
Number of proteins total (in the input graph)
Definition: proteinortho_clustering.h:96
unsigned int species_id
the species id of this protein
Definition: proteinortho_clustering.h:229
const unsigned int min_iter
Minimal number of iterations for power iteration.
Definition: proteinortho_clustering.h:85
const unsigned int kmereHeuristic_protPerSpecies
The minimum number of proteins per species for the kmere-heuristic.
Definition: proteinortho_clustering.h:89
Definition: proteinortho_clustering.h:172
const unsigned int kmereHeuristic_minNodes
= 2^20. The minimum number of nodes of the connected component for the kmere-heuristic.
Definition: proteinortho_clustering.h:88
vector< unsigned int > m_content_CC
ids of the induced subgraph of the global variable graph. E.g. m_content_CC=[0,4,6] -> proteins graph...
Definition: proteinortho_clustering.h:175
Definition: proteinortho_clustering.h:227
vector< protein > graph
Graph containing all protein data (see the class protein for more information)
Definition: proteinortho_clustering.h:98
unsigned int size() const
Definition: proteinortho_clustering.h:202
floattype calc_group(vector< unsigned int > &nodes)
floattype last_stat
For buffering progress stats.
Definition: proteinortho_clustering.h:99
unsigned int param_minOpenmp
the minimum size of a for-loop for openmp to activate (openmp has some initialization costs)
Definition: proteinortho_clustering.h:78
bool param_useWeights
If true, then weights are used for the calculation of the algebraic connectivity.
Definition: proteinortho_clustering.h:77
unsigned int & operator[](unsigned int i)
Definition: proteinortho_clustering.h:188
unsigned int size()
Definition: proteinortho_clustering.h:201
unsigned int d_sum
sum of node degrees
Definition: proteinortho_clustering.h:176
vector< floattype > makeOrthogonal(vector< floattype > x)
floattype string2floattype(string str)
floattype param_con_threshold
as a reference: a chain a-b-c-d has 0.25
Definition: proteinortho_clustering.h:71
bool test__makeOrthogonal()
vector< floattype > getY(floattype max_degree, vector< floattype > x_hat, vector< floattype > x_new, vector< unsigned int > &nodes, vector< unsigned int > &diag)
unsigned int sumOutDegs(const vector< unsigned int > &nodes)
void parse_file(string file)
void dssyevr_(char *jobz, char *range, char *uplo, int *n, T *a, int *lda, T *vl, T *vu, int *il, int *iu, T *abstol, int *m, T *w, T *z, int *ldz, int *isuppz, T *work, int *lwork, int *iwork, int *liwork, int *info)
Definition: proteinortho_clustering.h:139
vector< string > species
Species ID (number) -> Species name.
Definition: proteinortho_clustering.h:97
unsigned int param_maxRam_inKB
= 16 GB of memory as default. Decreasing this may lead to more power iterations and thus to an increas...
Definition: proteinortho_clustering.h:80