Actual source code: aijAssemble.cu
petsc-3.4.2 2013-07-02
#include "petscconf.h"
PETSC_CUDA_EXTERN_C_BEGIN
#include "../src/mat/impls/aij/seq/aij.h"
#include "petscbt.h"
#include "../src/vec/vec/impls/dvecimpl.h"
#include "petsc-private/vecimpl.h"
PETSC_CUDA_EXTERN_C_END
#undef VecType
#include "../src/mat/impls/aij/seq/seqcusp/cuspmatimpl.h"

#include <thrust/reduce.h>
#include <thrust/inner_product.h>

#include <cusp/array1d.h>
#include <cusp/print.h>
#include <cusp/coo_matrix.h>

#include <cusp/io/matrix_market.h>

#include <thrust/iterator/counting_iterator.h>
#include <thrust/iterator/transform_iterator.h>
#include <thrust/iterator/permutation_iterator.h>
#include <thrust/functional.h>

// this example illustrates how to make repeated access to a range of values
// examples:
//   repeated_range([0, 1, 2, 3], 1) -> [0, 1, 2, 3]
//   repeated_range([0, 1, 2, 3], 2) -> [0, 0, 1, 1, 2, 2, 3, 3]
//   repeated_range([0, 1, 2, 3], 3) -> [0, 0, 0, 1, 1, 1, 2, 2, 2, 3, 3, 3]
//   ...

template <typename Iterator>
class repeated_range
{
public:

  typedef typename thrust::iterator_difference<Iterator>::type difference_type;

  struct repeat_functor : public thrust::unary_function<difference_type,difference_type>
  {
    difference_type repeats;

    repeat_functor(difference_type repeats) : repeats(repeats) {}

    __host__ __device__
    difference_type operator()(const difference_type &i) const {
      return i / repeats;
    }
  };

  typedef typename thrust::counting_iterator<difference_type>                   CountingIterator;
  typedef typename thrust::transform_iterator<repeat_functor, CountingIterator> TransformIterator;
  typedef typename thrust::permutation_iterator<Iterator,TransformIterator>     PermutationIterator;

  // type of the repeated_range iterator
  typedef PermutationIterator iterator;

  // construct repeated_range for the range [first,last)
  repeated_range(Iterator first, Iterator last, difference_type repeats) : first(first), last(last), repeats(repeats) {}

  iterator begin(void) const
  {
    return PermutationIterator(first, TransformIterator(CountingIterator(0), repeat_functor(repeats)));
  }

  iterator end(void) const
  {
    return begin() + repeats * (last - first);
  }

protected:
  difference_type repeats;
  Iterator        first;
  Iterator        last;
};
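
// Minimal usage sketch for repeated_range (illustration only; the helper name,
// array type, and contents are assumptions, not part of the original file):
// expanding [10, 20, 30] with repeats = 2 yields [10, 10, 20, 20, 30, 30].
static inline void repeated_range_usage_sketch(void)
{
  typedef cusp::array1d<int, cusp::device_memory> IntArray;

  int      h_data[] = {10, 20, 30};
  IntArray d_data(h_data, h_data + 3);
  IntArray d_out(6);                    // repeats * (last - first) entries

  repeated_range<IntArray::iterator> rep(d_data.begin(), d_data.end(), 2);
  thrust::copy(rep.begin(), rep.end(), d_out.begin());   // d_out = [10, 10, 20, 20, 30, 30]
}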

// this example illustrates how to repeat blocks in a range multiple times
// examples:
//   tiled_range([0, 1, 2, 3], 2)    -> [0, 1, 2, 3, 0, 1, 2, 3]
//   tiled_range([0, 1, 2, 3], 4, 2) -> [0, 1, 2, 3, 0, 1, 2, 3]
//   tiled_range([0, 1, 2, 3], 2, 2) -> [0, 1, 0, 1, 2, 3, 2, 3]
//   tiled_range([0, 1, 2, 3], 2, 3) -> [0, 1, 0, 1, 0, 1, 2, 3, 2, 3, 2, 3]
//   ...

template <typename Iterator>
class tiled_range
{
public:

  typedef typename thrust::iterator_difference<Iterator>::type difference_type;

  struct tile_functor : public thrust::unary_function<difference_type,difference_type>
  {
    difference_type repeats;
    difference_type tile_size;

    tile_functor(difference_type repeats, difference_type tile_size) : tile_size(tile_size), repeats(repeats) {}

    __host__ __device__
    difference_type operator()(const difference_type &i) const {
      return tile_size * (i / (tile_size * repeats)) + i % tile_size;
    }
  };

  typedef typename thrust::counting_iterator<difference_type>                 CountingIterator;
  typedef typename thrust::transform_iterator<tile_functor, CountingIterator> TransformIterator;
  typedef typename thrust::permutation_iterator<Iterator,TransformIterator>   PermutationIterator;

  // type of the tiled_range iterator
  typedef PermutationIterator iterator;

  // construct tiled_range for the range [first,last)
  tiled_range(Iterator first, Iterator last, difference_type repeats)
    : first(first), last(last), repeats(repeats), tile_size(last - first) {}

  tiled_range(Iterator first, Iterator last, difference_type repeats, difference_type tile_size)
    : first(first), last(last), repeats(repeats), tile_size(tile_size)
  {
    // ASSERT((last - first) % tile_size == 0)
  }

  iterator begin(void) const
  {
    return PermutationIterator(first, TransformIterator(CountingIterator(0), tile_functor(repeats, tile_size)));
  }

  iterator end(void) const
  {
    return begin() + repeats * (last - first);
  }

protected:
  difference_type repeats;
  difference_type tile_size;
  Iterator        first;
  Iterator        last;
};
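
// Minimal usage sketch for tiled_range (illustration only; the helper name,
// array type, and contents are assumptions): with tile_size = 2 and repeats = 2,
// [10, 20, 30, 40] becomes [10, 20, 10, 20, 30, 40, 30, 40]; the single-tile
// constructor tiled_range(first, last, 2) would instead give
// [10, 20, 30, 40, 10, 20, 30, 40].
static inline void tiled_range_usage_sketch(void)
{
  typedef cusp::array1d<int, cusp::device_memory> IntArray;

  int      h_data[] = {10, 20, 30, 40};
  IntArray d_data(h_data, h_data + 4);
  IntArray d_out(8);                    // repeats * (last - first) entries

  tiled_range<IntArray::iterator> tiles(d_data.begin(), d_data.end(), 2, 2);
  thrust::copy(tiles.begin(), tiles.end(), d_out.begin());  // d_out = [10, 20, 10, 20, 30, 40, 30, 40]
}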

typedef cusp::device_memory                memSpace;
typedef int                                IndexType;
typedef PetscScalar                        ValueType;
typedef cusp::array1d<IndexType, memSpace> IndexArray;
typedef cusp::array1d<ValueType, memSpace> ValueArray;
typedef IndexArray::iterator               IndexArrayIterator;
typedef ValueArray::iterator               ValueArrayIterator;

// Ne: Number of elements
// Nl: Number of dof per element
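//
// elemRows therefore holds Ne*Nl global row indices (listed element by element),
// and elemMats holds Ne*Nl*Nl values: one dense Nl x Nl element matrix per
// element, stored row by row, matching the index expansion performed below.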

PetscErrorCode MatSetValuesBatch_SeqAIJCUSP(Mat J, PetscInt Ne, PetscInt Nl, PetscInt *elemRows, const PetscScalar *elemMats)
{
  size_t   N  = Ne * Nl;
  size_t   No = Ne * Nl*Nl;
  PetscInt Nr; // Number of rows

  // copy elemRows and elemMat to device
  IndexArray d_elemRows(elemRows, elemRows + N);
  ValueArray d_elemMats(elemMats, elemMats + No);

  MatGetSize(J, &Nr, NULL);
  // allocate storage for "fat" COO representation of matrix
  PetscInfo1(J, "Making COO matrix of size %d\n", Nr);
  cusp::coo_matrix<IndexType,ValueType, memSpace> COO(Nr, Nr, No);

  // repeat elemRows entries Nl times
  PetscInfo(J, "Making row indices\n");
  repeated_range<IndexArrayIterator> rowInd(d_elemRows.begin(), d_elemRows.end(), Nl);
  thrust::copy(rowInd.begin(), rowInd.end(), COO.row_indices.begin());

  // tile rows of elemRows Nl times
  PetscInfo(J, "Making column indices\n");
  tiled_range<IndexArrayIterator> colInd(d_elemRows.begin(), d_elemRows.end(), Nl, Nl);
  thrust::copy(colInd.begin(), colInd.end(), COO.column_indices.begin());
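  // For example (illustration only), a single element with Nl = 2 and global
  // rows {7, 9} contributes row indices [7, 7, 9, 9] and column indices
  // [7, 9, 7, 9], so the row-major entries of its 2x2 element matrix land at
  // (7,7), (7,9), (9,7), (9,9) in that order.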

  // copy values from elemMats into COO structure (could be avoided)
  thrust::copy(d_elemMats.begin(), d_elemMats.end(), COO.values.begin());

  // For MPIAIJ, split this into two COO matrices, and return both
  //   Need the column map

  // print the "fat" COO representation
#if !defined(PETSC_USE_COMPLEX)
  if (PetscLogPrintInfo) cusp::print(COO);
#endif

  // sort COO format by (i,j), this is the most costly step
  PetscInfo(J, "Sorting rows and columns\n");
#if 1
  COO.sort_by_row_and_column();
#else
  {
    PetscInfo(J, " Making permutation\n");
    IndexArray permutation(No);
    thrust::sequence(permutation.begin(), permutation.end());

    // compute permutation and sort by (I,J)
    {
      PetscInfo(J, " Sorting columns\n");
      IndexArray temp(No);
      thrust::copy(COO.column_indices.begin(), COO.column_indices.end(), temp.begin());
      thrust::stable_sort_by_key(temp.begin(), temp.end(), permutation.begin());
      PetscInfo(J, " Sorted columns\n");
      if (PetscLogPrintInfo) {
        for (IndexArrayIterator t_iter = temp.begin(), p_iter = permutation.begin(); t_iter != temp.end(); ++t_iter, ++p_iter) {
          PetscInfo2(J, "%d(%d)\n", *t_iter, *p_iter);
        }
      }

      PetscInfo(J, " Copying rows\n");
      //cusp::copy(COO.row_indices, temp);
      thrust::copy(COO.row_indices.begin(), COO.row_indices.end(), temp.begin());
      PetscInfo(J, " Gathering rows\n");
      thrust::gather(permutation.begin(), permutation.end(), temp.begin(), COO.row_indices.begin());
      PetscInfo(J, " Sorting rows\n");
      thrust::stable_sort_by_key(COO.row_indices.begin(), COO.row_indices.end(), permutation.begin());

      PetscInfo(J, " Gathering columns\n");
      cusp::copy(COO.column_indices, temp);
      thrust::gather(permutation.begin(), permutation.end(), temp.begin(), COO.column_indices.begin());
    }

    // use permutation to reorder the values
    {
      PetscInfo(J, " Sorting values\n");
      ValueArray temp(COO.values);
      cusp::copy(COO.values, temp);
      thrust::gather(permutation.begin(), permutation.end(), temp.begin(), COO.values.begin());
    }
  }
#endif

  // print the "fat" COO representation
#if !defined(PETSC_USE_COMPLEX)
  if (PetscLogPrintInfo) cusp::print(COO);
#endif

  // compute number of unique (i,j) entries
  //   this counts the number of changes as we move along the (i,j) list
  PetscInfo(J, "Computing number of unique entries\n");
  size_t num_entries = thrust::inner_product
                         (thrust::make_zip_iterator(thrust::make_tuple(COO.row_indices.begin(), COO.column_indices.begin())),
                          thrust::make_zip_iterator(thrust::make_tuple(COO.row_indices.end(),   COO.column_indices.end()))   - 1,
                          thrust::make_zip_iterator(thrust::make_tuple(COO.row_indices.begin(), COO.column_indices.begin())) + 1,
                          size_t(1),
                          thrust::plus<size_t>(),
                          thrust::not_equal_to< thrust::tuple<IndexType,IndexType> >());
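  // For example (illustration only), with sorted keys (0,0), (0,0), (0,1), (2,3)
  // the adjacent-pair comparison finds 2 changes, and adding the initial 1 gives
  // num_entries = 3 unique (i,j) pairs.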

  // allocate COO storage for final matrix
  PetscInfo(J, "Allocating compressed matrix\n");
  cusp::coo_matrix<IndexType, ValueType, memSpace> A(Nr, Nr, num_entries);

  // sum values with the same (i,j) index
  // XXX thrust::reduce_by_key is unoptimized right now, so we provide a SpMV-based one in cusp::detail
  //     the Cusp one is 2x faster, but still not optimal
  //     This could possibly be done in-place
  PetscInfo(J, "Compressing matrix\n");
  thrust::reduce_by_key(thrust::make_zip_iterator(thrust::make_tuple(COO.row_indices.begin(), COO.column_indices.begin())),
                        thrust::make_zip_iterator(thrust::make_tuple(COO.row_indices.end(), COO.column_indices.end())),
                        COO.values.begin(),
                        thrust::make_zip_iterator(thrust::make_tuple(A.row_indices.begin(), A.column_indices.begin())),
                        A.values.begin(),
                        thrust::equal_to< thrust::tuple<IndexType,IndexType> >(),
                        thrust::plus<ValueType>());
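  // For example (illustration only), duplicate keys (0,0), (0,0), (0,1) with
  // values 1.0, 2.0, 5.0 compress to keys (0,0), (0,1) with values 3.0, 5.0,
  // i.e. overlapping element contributions are summed, as with ADD_VALUES.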

  // print the final matrix
#if !defined(PETSC_USE_COMPLEX)
  if (PetscLogPrintInfo) cusp::print(A);
#endif
  //std::cout << "Writing matrix" << std::endl;
  //cusp::io::write_matrix_market_file(A, "A.mtx");

  PetscInfo(J, "Converting to PETSc matrix\n");
  MatSetType(J, MATSEQAIJCUSP);
  //cusp::csr_matrix<PetscInt,PetscScalar,cusp::device_memory> Jgpu;
  CUSPMATRIX *Jgpu = new CUSPMATRIX;
  cusp::convert(A, *Jgpu);
#if !defined(PETSC_USE_COMPLEX)
  if (PetscLogPrintInfo) cusp::print(*Jgpu);
#endif
  PetscInfo(J, "Copying to CPU matrix\n");
  MatCUSPCopyFromGPU(J, Jgpu);
  return(0);
}
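
// Minimal driver sketch (illustration only; the helper name, matrix sizes, and
// element data are assumptions): two 1D linear elements sharing node 1 are
// assembled through the public entry point MatSetValuesBatch(), which dispatches
// to the routine above for a MATSEQAIJCUSP matrix. Error checking is omitted,
// as it is in the routine above.
static inline PetscErrorCode BatchAssemblyUsageSketch(MPI_Comm comm, Mat *J)
{
  PetscInt    elemRows[4] = {0, 1, 1, 2};          // Ne = 2 elements, Nl = 2 dof each
  PetscScalar elemMats[8] = { 1.0, -1.0,           // element 0, stored row by row
                             -1.0,  1.0,
                              1.0, -1.0,           // element 1; entry (1,1) sums to 2.0
                             -1.0,  1.0};

  MatCreate(comm, J);
  MatSetSizes(*J, PETSC_DECIDE, PETSC_DECIDE, 3, 3);
  MatSetType(*J, MATSEQAIJCUSP);
  MatSetValuesBatch(*J, 2, 2, elemRows, elemMats);
  MatAssemblyBegin(*J, MAT_FINAL_ASSEMBLY);
  MatAssemblyEnd(*J, MAT_FINAL_ASSEMBLY);
  return(0);
}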