proteinortho  6.0.14
proteinortho_clustering.h
Go to the documentation of this file.
1 /*
2  * Clustering algorithm for Proteinortho
3  * Reads edge list and splits connected components
4  * according to algebraic connectivity threshold
5  *
6  * Last updated: 2018/03/01
7  * Author: Marcus Lechner
8  */
9 
10 #ifndef _PROTEINORTHOCLUSTERING
11 #define _PROTEINORTHOCLUSTERING
12 
13 #include <iostream>
14 #include <fstream>
15 #include <sstream>
16 #include <string>
17 #include <list> //BFS/DFS
18 #include <map>
19 #include <algorithm>
20 #include <cmath>
21 #include <vector>
22 #include <stack>
23 #include <iomanip>
24 #include <cstdlib>
25 #include <ctime>
26 #include <memory>
27 #include <climits> // unsigned int max range
28 
29 #ifdef _OPENMP
30  #include <omp.h>
31 #endif
32 
33 using floattype = float;
34 #define floatprecision_H 1
35 //the floatprecision_H has to be 1 for float and 2 for double
36 
37 using namespace std;
38 
39 
44 //#define DEBUG
45 //#define timeAnalysis //if set : need -std=c++11 for compiling
46 
47  #ifdef timeAnalysis
48  #include <chrono>
49  map<string,floattype> t_master;
50  #endif
51 
52  #ifdef DEBUG
53  void debug__print_edgelist (protein& node, const unsigned int index, const int node_id);
55 
56  void debug__conn_integrity(vector<unsigned int>& nodes, floattype conn);
57 
58  void debug__graph_integrity(vector<unsigned int>& nodes);
59  #endif
60  // end of debug
62 
63 
64 
69  // Parameters (prefix param_*)
// Tunable parameters and iteration limits of the clustering run.
// NOTE(review): these are non-inline, non-extern definitions in a header; including
// this header from more than one translation unit would define them twice — confirm it is included only once.
70  bool param_verbose = false; // verbose output is off by default
72  unsigned int debug_level = 0; // debug level for stderr output
74  unsigned int param_max_nodes = 16777216; // = 2^24; absolute maximum number of nodes for a connected component
76  string param_rmgraph = "remove.graph"; // name of the "remove graph" holding the edges removed by the clustering
77  bool param_useWeights = true; // if true, edge weights are used when calculating the algebraic connectivity
78  unsigned int param_minOpenmp = 256; // minimum for-loop size for OpenMP to be activated (OpenMP has start-up costs)
80  unsigned int param_maxRam_inKB = 16777216; // = 2^24 KB, i.e. 16 GB of memory by default
81  bool param_useLapack = true; // if true, LAPACK is used according to param_lapack_power_threshold_d
82 
83  // min/max number of alg con iterations
84  unsigned int critical_min_nodes = 16777216; // deprecated
85  const unsigned int min_iter = 16; // minimal number of iterations of the power iteration
86  unsigned int param_max_iter = 8192; // maximal number of iterations of the power iteration
88  const unsigned int kmereHeuristic_minNodes = 1048576; // = 2^20; minimum number of nodes of a connected component for the kmere-heuristic
89  const unsigned int kmereHeuristic_protPerSpecies = 1; // minimum number of proteins per species for the kmere-heuristic
90  const unsigned int kmereHeuristic_minNumberOfGroups = 3; // minimum number of species for the kmere-heuristic
91  const unsigned int maxUseWeightsNumNodes = 1048576; // = 2^20; maximum number of nodes of a connected component for which weights are still used
93 
94  // Globals
// Global state shared by the functions declared in this header.
95  unsigned int species_counter = 0; // number of species in the input graph
96  unsigned int protein_counter = 0; // total number of proteins in the input graph
97  vector<string> species; // species id (number) -> species name
98  vector<protein> graph; // graph containing all protein data, one `protein` entry per node
100  unsigned int edges = 0; // total number of edges
101  vector<shared_ptr<ofstream> > graph_clean; // file handles used to store graph data
102  vector<int> reorder_table; // tells how proteins/species must be sorted
103  unsigned long graph_ram_total_inKB=0; // internal size of the input graph in KB
104  unsigned int num_cpus=1; // only one core is used by default
105  // end of globalVars
107 
108 
109 
110 
111 
// C-linkage declarations of the two LAPACK routines used by this header:
// ssyevr_ takes float data, dsyevr_ takes double data; the argument lists are otherwise identical.
// Every argument is passed by pointer.
118  extern "C" {
119  extern void ssyevr_( char* jobz, char* range, char* uplo, int* n, float* a,
120  int* lda, float* vl, float* vu, int* il, int* iu, float* abstol,
121  int* m, float* w, float* z, int* ldz, int* isuppz, float* work,
122  int* lwork, int* iwork, int* liwork, int* info );
123  extern void dsyevr_( char* jobz, char* range, char* uplo, int* n, double* a,
124  int* lda, double* vl, double* vu, int* il, int* iu, double* abstol,
125  int* m, double* w, double* z, int* ldz, int* isuppz, double* work,
126  int* lwork, int* iwork, int* liwork, int* info );
127 
128  }
129 
// Thin wrapper that forwards all arguments, unchanged and in the same order,
// to one of the LAPACK routines ssyevr_ / dsyevr_.
// NOTE(review): the routine is chosen by the preprocessor macro floatprecision_H
// (1 -> ssyevr_, 2 -> dsyevr_), not by the template parameter T, so T has to
// agree with the macro (float for 1, double for 2) for the call to compile.
139  template<class T> void dssyevr_( char* jobz, char* range, char* uplo, int* n, T* a,
140  int* lda, T* vl, T* vu, int* il, int* iu, T* abstol,
141  int* m, T* w, T* z, int* ldz, int* isuppz, T* work,
142  int* lwork, int* iwork, int* liwork, int* info ){ // forwards to ssyevr_ if floatprecision_H == 1 and to dsyevr_ if floatprecision_H == 2
143  #if floatprecision_H == 1
144  ssyevr_(jobz, range, uplo, n, a,
145  lda, vl, vu, il, iu, abstol,
146  m, w, z, ldz, isuppz, work,
147  lwork, iwork, liwork, info );
148  #elif floatprecision_H == 2
149  dsyevr_(jobz, range, uplo, n, a,
150  lda, vl, vu, il, iu, abstol,
151  m, w, z, ldz, isuppz, work,
152  lwork, iwork, liwork, info );
153  #else
// Any other macro value: report the error on cerr, then execute a bare `throw;`
// (which calls std::terminate when no exception is currently being handled).
154  cerr << string("Error: invalid floattype (should be either float=1 or double=2)!").c_str() << endl;throw;
155  #endif
156  }
157  // end of lapack
159 
160 
161 
162 
163 
164 
173  {
174  public:
175  vector<unsigned int> m_content_CC;
176  unsigned int d_sum;
177  double density;
178 
182  d_sum=0;
183  density=-1;
184  }
185 
// Bounds-checked mutable access to the i-th node id stored in m_content_CC.
// Valid indices are 0 .. size()-1, so an index equal to size() is rejected as well.
// On an invalid index a message is written to cerr and a bare `throw;` is executed
// (this calls std::terminate when no exception is currently being handled).
188  unsigned int& operator[](unsigned int i){
189  if(i >= m_content_CC.size()){cerr << "[CRITICAL ERROR] out of bound in ConnectedComponent[]" << endl; throw;}
190  return m_content_CC[i];
191  }
// Bounds-checked read-only access to the i-th node id stored in m_content_CC.
// Valid indices are 0 .. size()-1, so an index equal to size() is rejected as well.
// On an invalid index a message is written to cerr and a bare `throw;` is executed
// (this calls std::terminate when no exception is currently being handled).
194  const unsigned int& operator[](unsigned int i)const{
195  if(i >= m_content_CC.size()){cerr << "[CRITICAL ERROR] out of bound in ConnectedComponent[]" << endl; throw;}
196  return m_content_CC[i];
197  }
198 
// Number of node ids held in m_content_CC (overload for non-const objects).
201  unsigned int size(){ const unsigned int count = m_content_CC.size(); return count; }
// Same value, callable on const objects.
202  unsigned int size()const { const unsigned int count = m_content_CC.size(); return count; }
203 
// Copy assignment: copies the node list, the degree sum and the density from D.
// Returns a reference to this object so that assignments can be chained and the
// type behaves like a regular assignable type; callers that ignore the result are unaffected.
206  ConnectedComponent& operator = (const ConnectedComponent &D ) {
207  m_content_CC = D.m_content_CC;
208  d_sum = D.d_sum;
209  density = D.density;
 return *this;
210  }
// Appends the node id i to the end of m_content_CC.
213  void push_back(unsigned int i) {
214  m_content_CC.emplace_back(i);
215  }
216  };
217 
// A weighted edge: one entry of a protein's adjacency list (see protein::edges).
220  struct wedge {
221  unsigned int edge; // index of the adjacent node in the global vector `graph`
222  unsigned short weight; // weight of this edge
223  };
224 
// One node of the global `graph`: a protein together with its adjacency list.
227  struct protein {
228  vector<wedge> edges; // all adjacent nodes along with their weights
229  unsigned int species_id; // id of the species this protein belongs to
230  string full_name; // full input name of this protein
231  };
232 
// Comparison functor ordering connected components by their `density` member:
// yields true exactly when a.density is smaller than b.density.
// NOTE(review): the original remark here read "sort from large to small"; whether the
// largest density ends up first depends on the consumer (e.g. std::priority_queue vs. std::sort) — confirm at the call site.
235  struct compare_ConnectedComponents {
236  bool operator() (const ConnectedComponent &a, const ConnectedComponent &b) const {
237  return b.density > a.density;
238  }
239  };
240 
241 
245  {
// Compares two (string, int) pairs by their int component only:
// true when p1.second is strictly greater than p2.second.
246  inline bool operator() (const pair<string,int> & p1, const pair<string,int>& p2)
247  {
248  return (p2.second < p1.second);
249  }
250  };
251  // end of class
253 
254 
// Declaration only; the definition lives elsewhere. Takes a pointer to a visited-flag
// vector and a start node and returns a ConnectedComponent (whose `density` is,
// per this file's notes, calculated at the end of BFS).
// The declaration is terminated with ';' so it does not run into the next prototype.
261  ConnectedComponent BFS(vector<bool> * done, unsigned int cur_node );
262 
263 
269  size_t getPeakRSS( );
270 
276  size_t getCurrentRSS( );
277 
279  // Main
281 
284  void printHelp() ;
285 
289  unsigned int numberOfNodesToMemoryUsageLaplacian_inKB(unsigned int n);
290 
294  unsigned int BFS_not_critical( vector<unsigned int> * done_withBacktrack, unsigned int start_node, unsigned int end_node );
295 
300 
301 
303  // Major partitioning algorithm
305 
314  void partition_graph();
315 
317  // Basic Graph functions
319 
322  void clear_edges(vector<unsigned int>& nodes);
323 
328  void removeExternalEdges(map<unsigned int,bool>& a);
329 
332  void remove_edge_index(const unsigned int node_id, const unsigned int index);
333 
335  // File parser
337 
340  void parse_file(string file);
341 
343  // Output
345 
347  void sort_species(void);
348 
351  void stats(floattype i, floattype size);
352 
355  void print_header();
356 
359  void print_group(vector<unsigned int>& nodes, floattype connectivity);
360 
363  floattype calc_group(vector<unsigned int>& nodes);
364 
366  // Misc functions
368 
370  floattype string2floattype(string str);
371 
374  void tokenize(const string& str, vector<string>& tokens, const string& delimiters = "\t");
375 
377  // Algebraic connectivity functions
379 
381  unsigned int max_of_diag(vector<unsigned int>& nodes, vector<unsigned int>& diag);
382 
385  vector<floattype> generate_random_vector(const unsigned int size);
386 
389  vector<floattype> get_new_x(vector<floattype> x, vector<unsigned int>& nodes, map<unsigned int,unsigned int> &mapping, bool isWeighted);
390 
393  vector<floattype> makeOrthogonal(vector<floattype> x);
394 
397  vector<floattype> normalize(vector<floattype> x, floattype *length);
398 
401  vector<floattype> getY(floattype max_degree, vector<floattype> x_hat, vector<floattype> x_new, vector<unsigned int>& nodes, vector<unsigned int>& diag);
402 
405  unsigned int sumOutDegs(const vector<unsigned int>& nodes);
406 
412  floattype getConnectivity(vector<unsigned int>& nodes, bool useLapack);
413 
414  bool comparator_pairfloattypeUInt ( const pair<floattype,unsigned int>& l, const pair<floattype,unsigned int>& r );
415 
423  void splitGroups(vector<floattype>& y, vector<unsigned int>& nodes , bool useLapack);
424 
427  string getTime(void);
428 
431  bool test__max_of_diag();
432 
436 
439  bool test__get_new_x();
440 
443  bool test__makeOrthogonal();
444 
447  bool test__normalize();
448 
451  bool test__getY();
452 
455  bool test__splitGroups();
456  // end of debug
458 
459 
460 
461 #endif /* _PROTEINORTHOCLUSTERING */
ssyevr_
void ssyevr_(char *jobz, char *range, char *uplo, int *n, float *a, int *lda, float *vl, float *vu, int *il, int *iu, float *abstol, int *m, float *w, float *z, int *ldz, int *isuppz, float *work, int *lwork, int *iwork, int *liwork, int *info)
test__max_of_diag
bool test__max_of_diag()
criticalHeuristic
bool criticalHeuristic(ConnectedComponent *cur_cc)
ConnectedComponent::push_back
void push_back(unsigned int i)
Definition: proteinortho_clustering.h:213
wedge::weight
unsigned short weight
the weight of this given edge
Definition: proteinortho_clustering.h:222
getTime
string getTime(void)
protein::full_name
string full_name
the full input name of this protein
Definition: proteinortho_clustering.h:230
getConnectivity
floattype getConnectivity(vector< unsigned int > &nodes, bool useLapack)
splitGroups
void splitGroups(vector< floattype > &y, vector< unsigned int > &nodes, bool useLapack)
ConnectedComponent::ConnectedComponent
ConnectedComponent()
Definition: proteinortho_clustering.h:181
BFS_not_critical
unsigned int BFS_not_critical(vector< unsigned int > *done_withBacktrack, unsigned int start_node, unsigned int end_node)
dsyevr_
void dsyevr_(char *jobz, char *range, char *uplo, int *n, double *a, int *lda, double *vl, double *vu, int *il, int *iu, double *abstol, int *m, double *w, double *z, int *ldz, int *isuppz, double *work, int *lwork, int *iwork, int *liwork, int *info)
ConnectedComponent::density
double density
the graph density calculated in the function BFS (at the end)
Definition: proteinortho_clustering.h:177
ConnectedComponent::operator[]
const unsigned int & operator[](unsigned int i) const
Definition: proteinortho_clustering.h:194
get_new_x
vector< floattype > get_new_x(vector< floattype > x, vector< unsigned int > &nodes, map< unsigned int, unsigned int > &mapping, bool isWeighted)
param_verbose
bool param_verbose
By default no verbose output is printed.
Definition: proteinortho_clustering.h:70
param_sep_purity
floattype param_sep_purity
as a reference: a-b-c will give +/-0.707107 and 2.34857e-08
Definition: proteinortho_clustering.h:73
reorder_table
vector< int > reorder_table
Tells how proteins/species must be sorted.
Definition: proteinortho_clustering.h:102
partition_graph
void partition_graph()
BFS
ConnectedComponent BFS(vector< bool > *done, unsigned int cur_node) size_t getPeakRSS()
param_useLapack
bool param_useLapack
If true, then lapack is used according to param_lapack_power_threshold_d.
Definition: proteinortho_clustering.h:81
printHelp
void printHelp()
test__splitGroups
bool test__splitGroups()
compare_ConnectedComponents
Definition: proteinortho_clustering.h:235
critical_min_nodes
unsigned int critical_min_nodes
Deprecated.
Definition: proteinortho_clustering.h:84
numberOfNodesToMemoryUsageLaplacian_inKB
unsigned int numberOfNodesToMemoryUsageLaplacian_inKB(unsigned int n)
removeExternalEdges
void removeExternalEdges(map< unsigned int, bool > &a)
protein::edges
vector< wedge > edges
all the adjacent nodes along with weights -> wedge
Definition: proteinortho_clustering.h:228
getCurrentRSS
size_t getCurrentRSS()
num_cpus
unsigned int num_cpus
By default only one core is used (change this with -cpus)
Definition: proteinortho_clustering.h:104
param_max_nodes
unsigned int param_max_nodes
= 2^24. The absolute maximal number of nodes for a connected component. If exceeded then it is skippe...
Definition: proteinortho_clustering.h:74
floattype
float floattype
Definition: proteinortho_clustering.h:33
param_lapack_power_threshold_d
floattype param_lapack_power_threshold_d
The minimum graph density for the power iteration method, lapacks (d|s)syevr is used otherwise....
Definition: proteinortho_clustering.h:92
test__normalize
bool test__normalize()
maxUseWeightsNumNodes
const unsigned int maxUseWeightsNumNodes
= 2^20. The maximum number of nodes for a connected component such that weights are still used for ca...
Definition: proteinortho_clustering.h:91
species_counter
unsigned int species_counter
Number of species (in the input graph)
Definition: proteinortho_clustering.h:95
param_useKmereHeuristic
bool param_useKmereHeuristic
If true, then the kmere-heuristic is used. For ambiguous splits (e.g. all but one entry of the Fiedler vec...
Definition: proteinortho_clustering.h:79
protein_degree
Definition: proteinortho_clustering.h:244
sort_species
void sort_species(void)
print_group
void print_group(vector< unsigned int > &nodes, floattype connectivity)
remove_edge_index
void remove_edge_index(const unsigned int node_id, const unsigned int index)
debug_level
unsigned int debug_level
Debug stderr level.
Definition: proteinortho_clustering.h:72
param_min_species
floattype param_min_species
Minimum number of species per connected component.
Definition: proteinortho_clustering.h:75
param_max_iter
unsigned int param_max_iter
Maximal number of iterations for power iteration.
Definition: proteinortho_clustering.h:86
wedge::edge
unsigned int edge
the adjacent node index of variable graph
Definition: proteinortho_clustering.h:221
graph_clean
vector< shared_ptr< ofstream > > graph_clean
File handler to store graph data.
Definition: proteinortho_clustering.h:101
max_of_diag
unsigned int max_of_diag(vector< unsigned int > &nodes, vector< unsigned int > &diag)
normalize
vector< floattype > normalize(vector< floattype > x, floattype *length)
graph_ram_total_inKB
unsigned long graph_ram_total_inKB
The internal size of the input graph in KB (this will be set in parse_file() )
Definition: proteinortho_clustering.h:103
generate_random_vector
vector< floattype > generate_random_vector(const unsigned int size)
clear_edges
void clear_edges(vector< unsigned int > &nodes)
param_rmgraph
string param_rmgraph
Name of the remove graph, containing all edges that are removed by this clustering algorithm.
Definition: proteinortho_clustering.h:76
kmereHeuristic_minNumberOfGroups
const unsigned int kmereHeuristic_minNumberOfGroups
The minimum number of species for the kmere-heuristic.
Definition: proteinortho_clustering.h:90
param_epsilon
floattype param_epsilon
Epsilon for lapack functions dsyevr/ssyevr as well as the power iteration. Set analog to http://peopl...
Definition: proteinortho_clustering.h:87
edges
unsigned int edges
Total number of edges.
Definition: proteinortho_clustering.h:100
tokenize
void tokenize(const string &str, vector< string > &tokens, const string &delimiters="\t")
comparator_pairfloattypeUInt
bool comparator_pairfloattypeUInt(const pair< floattype, unsigned int > &l, const pair< floattype, unsigned int > &r)
test__generate_random_vector
bool test__generate_random_vector()
test__getY
bool test__getY()
wedge
Definition: proteinortho_clustering.h:220
stats
void stats(floattype i, floattype size)
protein_counter
unsigned int protein_counter
Number of proteins total (in the input graph)
Definition: proteinortho_clustering.h:96
protein::species_id
unsigned int species_id
the species id of this protein
Definition: proteinortho_clustering.h:229
min_iter
const unsigned int min_iter
Minimal number of iterations for power iteration.
Definition: proteinortho_clustering.h:85
kmereHeuristic_protPerSpecies
const unsigned int kmereHeuristic_protPerSpecies
The minimum number of proteins per species for the kmere-heuristic.
Definition: proteinortho_clustering.h:89
test__get_new_x
bool test__get_new_x()
ConnectedComponent
Definition: proteinortho_clustering.h:172
kmereHeuristic_minNodes
const unsigned int kmereHeuristic_minNodes
= 2^20. The minimum number of nodes of the connected component for the kmere-heuristic.
Definition: proteinortho_clustering.h:88
ConnectedComponent::m_content_CC
vector< unsigned int > m_content_CC
ids of the induced subgraph of the global variable graph. E.g. m_content_CC=[0,4,6] -> proteins graph...
Definition: proteinortho_clustering.h:175
protein
Definition: proteinortho_clustering.h:227
graph
vector< protein > graph
Graph containing all protein data (see the struct protein for more information)
Definition: proteinortho_clustering.h:98
ConnectedComponent::size
unsigned int size() const
Definition: proteinortho_clustering.h:202
calc_group
floattype calc_group(vector< unsigned int > &nodes)
last_stat
floattype last_stat
For buffering progress stats.
Definition: proteinortho_clustering.h:99
param_minOpenmp
unsigned int param_minOpenmp
the minimum size of a for-loop for openmp to activate (openmp has some initialization costs)
Definition: proteinortho_clustering.h:78
param_useWeights
bool param_useWeights
If true, then weights are used for the calculation of the algebraic connectivity.
Definition: proteinortho_clustering.h:77
ConnectedComponent::operator[]
unsigned int & operator[](unsigned int i)
Definition: proteinortho_clustering.h:188
print_header
void print_header()
ConnectedComponent::size
unsigned int size()
Definition: proteinortho_clustering.h:201
ConnectedComponent::d_sum
unsigned int d_sum
sum of node degrees
Definition: proteinortho_clustering.h:176
makeOrthogonal
vector< floattype > makeOrthogonal(vector< floattype > x)
string2floattype
floattype string2floattype(string str)
param_con_threshold
floattype param_con_threshold
as a reference: a chain a-b-c-d has 0.25
Definition: proteinortho_clustering.h:71
test__makeOrthogonal
bool test__makeOrthogonal()
getY
vector< floattype > getY(floattype max_degree, vector< floattype > x_hat, vector< floattype > x_new, vector< unsigned int > &nodes, vector< unsigned int > &diag)
sumOutDegs
unsigned int sumOutDegs(const vector< unsigned int > &nodes)
parse_file
void parse_file(string file)
dssyevr_
void dssyevr_(char *jobz, char *range, char *uplo, int *n, T *a, int *lda, T *vl, T *vu, int *il, int *iu, T *abstol, int *m, T *w, T *z, int *ldz, int *isuppz, T *work, int *lwork, int *iwork, int *liwork, int *info)
Definition: proteinortho_clustering.h:139
species
vector< string > species
Species ID (number) -> Species name.
Definition: proteinortho_clustering.h:97
param_maxRam_inKB
unsigned int param_maxRam_inKB
= 16 GB of memory as default. Decreasing this may lead to more power iterations and thus to an increas...
Definition: proteinortho_clustering.h:80