"""
Two Point Functions Framework.
"""
#-----------------------------------------------------------------------------
# Copyright (c) 2013, yt Development Team.
#
# Distributed under the terms of the Modified BSD License.
#
# The full license is in the file COPYING.txt, distributed with this software.
#-----------------------------------------------------------------------------
import h5py
from yt.mods import *
#from yt.utilities.math_utils import *
from yt.utilities.performance_counters import yt_counters, time_function
from yt.utilities.parallel_tools.parallel_analysis_interface import ParallelAnalysisInterface, parallel_blocking_call, parallel_root_only
try:
from yt.utilities.kdtree.api import *
except ImportError:
mylog.debug("The Fortran kD-Tree did not import correctly.")
import math, sys, itertools, inspect, types, time
from collections import defaultdict
sep = 12  # Column width for the plain-text output files.
class TwoPointFunctions(ParallelAnalysisInterface):
r""" Initialize a two point functions object.
Parameters
----------
total_values : Integer
How many total (global) pair calculations to run for each of the
functions specified. Default: 1000000.
    comm_size : Integer
        How many point pairs are sent during each communication cycle.
        Default: 10000.
    length_type : String
        Controls whether the ruler lengths are evenly spaced in
        logarithmic or linear space, set by "log" or "lin", respectively.
        Default: "lin".
length_number : Integer
Sets how many lengths to create, evenly spaced by the above
parameter. Default: 10.
    length_range : Float
        A min/max pair giving the range of ruler lengths to search over
        the simulation volume. Default: [sqrt(3)*dx, (shortest box edge)/2],
        where dx is the smallest grid cell size.
    vol_ratio : Integer
        How to multiply-assign subvolumes to the parallel
        tasks. This number must be an integer factor of the total number of
        tasks or the decomposition of the volume among tasks will fail.
        The default value of 1 will assign one task
        to each subvolume, and there will be as many subvolumes as tasks.
        A value of 2 will assign two tasks to each subvolume and there will be
        one-half as many subvolumes as tasks.
        A value equal to the number of parallel tasks will result in each task
        owning a complete copy of all the field data, meaning each task will be
        operating on the identical full volume.
        Setting it to -1 will automatically adjust it such that each task
        owns the entire volume. Default = 1.
salt : Integer
A number that will be added to the random number generator
seed. Use this if a different random series of numbers is desired when
keeping everything else constant from this set: (MPI task count,
number of ruler lengths, ruler min/max, number of functions,
number of point pairs per ruler length). Default = 0.
theta : Float
For random pairs of points, the second point is found by traversing
a distance along a ray set by the angle (phi, theta) from the first
point. To keep this angle constant, set ``theta`` to a value in the
range [0, pi]. Default = None, which will randomize theta for
every pair of points.
phi : Float
Similar to theta above, but the range of values is [0, 2*pi).
Default = None, which will randomize phi for every pair of points.
Examples
--------
>>> tpf = TwoPointFunctions(ds, ["velocity_x", "velocity_y", "velocity_z"],
... total_values=1e5, comm_size=10000,
... length_number=10, length_range=[1./128, .5],
... length_type="log")
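    A fuller sketch of a typical workflow; the two point function ``rms_vel``
    and the binning parameters below are illustrative assumptions, not
    defaults:
    >>> f1 = tpf.add_function(function=rms_vel, out_labels=['RMSvdiff'],
    ...                       sqrt=[True])
    >>> f1.set_pdf_params(bin_type='log', bin_range=[5e4, 5.5e13],
    ...                   bin_number=1000)
    >>> tpf.run_generator()
    >>> tpf.write_out_means()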
"""
    def __init__(self, ds, fields, left_edge=None, right_edge=None,
total_values=1000000, comm_size=10000, length_type="lin",
length_number=10, length_range=None, vol_ratio = 1,
salt=0, theta=None, phi=None):
ParallelAnalysisInterface.__init__(self)
try:
fKD
except NameError:
raise ImportError("You need to install the Forthon kD-Tree")
self._fsets = []
self.fields = fields
self.constant_theta = theta
self.constant_phi = phi
# MPI stuff.
self.size = self.comm.size
self.mine = self.comm.rank
self.vol_ratio = vol_ratio
if self.vol_ratio == -1:
self.vol_ratio = self.size
self.total_values = int(total_values / self.size)
# For communication.
self.recv_hooks = []
self.send_hooks = []
self.done_hooks = []
self.comm_size = min(int(comm_size), self.total_values)
self.ds = ds
self.nlevels = ds.index.max_level
self.period = self.ds.domain_right_edge - self.ds.domain_left_edge
self.min_edge = min(self.period)
self.index = ds.index
self.center = (ds.domain_right_edge + ds.domain_left_edge)/2.0
# Figure out the range of ruler lengths.
        if length_range is None:
length_range = [math.sqrt(3) * self.ds.index.get_smallest_dx(),
self.min_edge/2.]
else:
if len(length_range) != 2:
raise ValueError("length_range must have two values.")
if length_range[1] <= length_range[0]:
raise ValueError("length_range[1] must be larger than length_range[0]")
if length_range[1] > self.min_edge/2.:
length_range[1] = self.min_edge/2.
mylog.info("Automatically adjusting length_range[1] to half the shortest box edge.")
if length_range[0] == -1 or length_range[0] == -1.:
mylog.info("Automatically adjusting length_range[0] to %1.5e." % \
(math.sqrt(3) * self.ds.index.get_smallest_dx()))
length_range[0] = math.sqrt(3) * self.ds.index.get_smallest_dx()
# Make the list of ruler lengths.
if length_type == "lin":
self.lengths = np.linspace(length_range[0], length_range[1],
length_number)
elif length_type == "log":
self.lengths = np.logspace(math.log10(length_range[0]),
math.log10(length_range[1]), length_number)
else:
# Something went wrong.
raise SyntaxError("length_type is either \"lin\" or \"log\".")
# Subdivide the volume.
        if left_edge is None or right_edge is None:
self.left_edge = self.ds.domain_left_edge
self.right_edge = self.ds.domain_right_edge
# This ds business below has to do with changes made for halo
# finding on subvolumes and serves no purpose here except
# compatibility. This is not the best policy, if I'm honest.
ds = ds.region([0.]*3, self.left_edge, self.right_edge)
padded, self.LE, self.RE, self.ds = \
self.partition_index_3d(ds = ds, padding=0.,
rank_ratio = self.vol_ratio)
else:
self.left_edge = left_edge
self.right_edge = right_edge
# We do this twice, first with no 'buffer' to get the unbuffered
# self.LE/RE, and then second to get a buffered self.ds.
padded, self.LE, self.RE, temp = \
self.partition_region_3d(left_edge, right_edge,
rank_ratio=self.vol_ratio)
padded, temp, temp, self.ds = \
self.partition_region_3d(left_edge - self.lengths[-1], \
right_edge + self.lengths[-1], rank_ratio=self.vol_ratio)
mylog.info("LE %s RE %s %s" % (str(self.LE), str(self.RE), str(self.ds)))
self.width = self.ds.right_edge - self.ds.left_edge
self.mt = np.random.mtrand.RandomState(seed = 1234 * self.mine + salt)
    def add_function(self, function, out_labels, sqrt, corr_norm=None):
r"""Add a function to the list that will be evaluated at the
generated pairs of points.
Parameters
----------
function : Function
The two point function of the form fcn(a, b, r1, r2, vec).
out_labels : List of strings
A list of strings labeling the outputs of the function.
        sqrt : List of booleans
            A list of booleans, one per output; where True, the corresponding
            element of the output is square-rooted in the text output
            (write_out_means()).
corr_norm : Float
Used when calculating two point correlations. If set, the output
of the function is divided by this number. Default = None.
Examples
--------
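        A minimal sketch of a function with the required signature; the
        velocity-difference arithmetic is illustrative and assumes the fields
        passed to TwoPointFunctions are the three velocity components:
        >>> def rms_vel(a, b, r1, r2, vec):
        ...     # a and b hold the field values at the first and second point
        ...     # of each pair, r1 and r2 the coordinates, and vec the unit
        ...     # separation vector; return one array (or list) per out_label.
        ...     vdiff = a - b
        ...     np.power(vdiff, 2.0, vdiff)
        ...     vdiff = np.sum(vdiff, axis=1)
        ...     return vdiff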
>>> f1 = tpf.add_function(function=rms_vel, out_labels=['RMSvdiff'],
... sqrt=[True])
"""
        fargs = inspect.getfullargspec(function)
if len(fargs.args) != 5:
raise SyntaxError("The function %s needs five arguments." %\
function.__name__)
out_labels = list(out_labels)
if len(out_labels) < 1:
raise SyntaxError("Please specify at least one out_labels for function %s." %\
function.__name__)
sqrt = list(sqrt)
if len(sqrt) != len(out_labels):
raise SyntaxError("Please have the same number of elements in out_labels as in sqrt for function %s." %\
function.__name__)
self._fsets.append(FcnSet(self, function, self.min_edge,
out_labels, sqrt,corr_norm))
return self._fsets[-1]
def __getitem__(self, key):
return self._fsets[key]
    def run_generator(self):
r"""After all the functions have been added, run the generator.
Examples
--------
>>> tpf.run_generator()
"""
yt_counters("run_generator")
# We need a function!
if len(self._fsets) == 0:
mylog.error("You need to add at least one function!")
return None
# Do all the startup tasks to get the grid points.
if self.nlevels == 0:
yt_counters("build_sort")
self._build_sort_array()
self.sort_done = False
yt_counters("build_sort")
else:
yt_counters("init_kd_tree")
self._init_kd_tree()
self.sort_done = True
yt_counters("init_kd_tree")
# Store the fields.
self.stored_fields = {}
yt_counters("getting data")
for field in self.fields:
self.stored_fields[field] = self.ds[field].copy()
self.ds.clear_data()
# If the arrays haven't been sorted yet and need to be, do that.
if not self.sort_done:
for field in self.fields:
self.stored_fields[field] = self.stored_fields[field][self.sort]
del self.sort
self.sort_done = True
yt_counters("getting data")
self._build_fields_vals()
yt_counters("big loop over lengths")
t_waiting = 0.
for bigloop, length in enumerate(self.lengths):
self._build_points_array()
if self.mine == 0:
mylog.info("Doing length %1.5e" % length)
# Things stop when this value below equals total_values.
self.generated_points = 0
self.gen_array = np.zeros(self.size, dtype='int64')
self.comm_cycle_count = 0
self.final_comm_cycle_count = 0
self.sent_done = False
self._setup_done_hooks_on_root()
# While everyone else isn't done or I'm not done, we loop.
while self._should_cycle():
self._setup_recv_arrays()
self._send_arrays()
t0 = time.time()
self.comm.mpi_Request_Waitall(self.send_hooks)
self.comm.mpi_Request_Waitall(self.recv_hooks)
t1 = time.time()
t_waiting += (t1-t0)
if (self.recv_points < -1.).any() or (self.recv_points > 1.).any(): # or \
#(np.abs(np.log10(np.abs(self.recv_points))) > 20).any():
raise ValueError("self.recv_points is no good!")
self.points = self.recv_points.copy()
self.fields_vals = self.recv_fields_vals.copy()
self.gen_array = self.recv_gen_array.copy()
self._eval_points(length)
self.gen_array[self.mine] = self.generated_points
self.comm_cycle_count += 1
if self.generated_points == self.total_values:
self._send_done_to_root()
if self.mine == 0:
mylog.info("Length (%d of %d) %1.5e took %d communication cycles to complete." % \
(bigloop+1, len(self.lengths), length, self.comm_cycle_count))
yt_counters("big loop over lengths")
if self.nlevels >= 1:
del fKD.pos, fKD.qv_many, fKD.nn_tags
free_tree(0) # Frees the kdtree object.
yt_counters("allsum")
self._allsum_bin_hits()
mylog.info("Spent %f seconds waiting for communication." % t_waiting)
yt_counters("allsum")
yt_counters("run_generator")
def _init_kd_tree(self):
"""
Builds the kd tree of grid center points.
"""
# Grid cell centers.
mylog.info("Multigrid: Building kD-Tree.")
xp = self.ds["x"]
yp = self.ds["y"]
zp = self.ds["z"]
fKD.pos = np.asfortranarray(np.empty((3,xp.size), dtype='float64'))
# Normalize the grid points only within the kdtree.
fKD.pos[0, :] = xp[:] / self.period[0]
fKD.pos[1, :] = yp[:] / self.period[1]
fKD.pos[2, :] = zp[:] / self.period[2]
fKD.nn = 1
fKD.sort = False
fKD.rearrange = True
create_tree(0)
def _build_sort_array(self):
"""
When running on a unigrid simulation, the kD tree isn't necessary.
But we need to ensure that the points are sorted in the usual manner
allowing values to be found via array indices.
"""
mylog.info("Unigrid: finding cell centers.")
xp = self.ds["x"]
yp = self.ds["y"]
zp = self.ds["z"]
self.sizes = [np.unique(xp).size, np.unique(yp).size, np.unique(zp).size]
self.sort = np.lexsort([zp, yp, xp])
del xp, yp, zp
self.ds.clear_data()
def _build_fields_vals(self):
"""
Builds an array to store the field values array.
"""
self.fields_vals = np.empty((self.comm_size, len(self.fields)*2), \
dtype='float64')
# At the same time build a dict to label the columns.
self.fields_columns = {}
for i,field in enumerate(self.fields):
self.fields_columns[field] = i
def _build_points_array(self):
"""
Initializes the array that contains the random points as all negatives
to start with.
"""
self.points = np.ones((self.comm_size, 6), dtype='float64') * -1.0
def _setup_done_hooks_on_root(self):
"""
Opens non-blocking receives on root pointing to all the other tasks
"""
if self.mine != 0:
return
self.recv_done = {}
for task in range(self.size):
if task == self.mine: continue
self.recv_done[task] = np.zeros(1, dtype='int64')
self.done_hooks.append(self.comm.mpi_nonblocking_recv(self.recv_done[task], \
task, tag=15))
def _send_done_to_root(self):
"""
Tell the root process that I'm done.
"""
# If I've already done this, don't do it again.
if self.sent_done: return
        if self.mine != 0:
            # I send when I *think* things should finish.
            self.send_done = np.ones(1, dtype='int64') * \
                (self.size // self.vol_ratio - 1) + self.comm_cycle_count
self.done_hooks.append(self.comm.mpi_nonblocking_send(self.send_done, \
0, tag=15))
else:
# As root, I need to mark myself!
            self.recv_done[0] = np.ones(1, dtype='int64') * \
                (self.size // self.vol_ratio - 1) + self.comm_cycle_count
self.sent_done = True
def _should_cycle(self):
"""
Determine if I should continue cycling the communication.
"""
if self.mine == 0:
# If other tasks aren't finished, this will return False.
status = self.comm.mpi_Request_Testall(self.done_hooks)
            # Combine this with root's status.
status = status * (self.generated_points == self.total_values)
if status == 1:
# If they are all finished, meaning Testall returns True,
# and root has made its points, we find
# the biggest value in self.recv_done and stop there.
status = max(self.recv_done.values())
else:
status = 0
# Broadcast the status from root - we stop only if root thinks we should
# stop.
status = self.comm.mpi_bcast(status)
if status == 0: return True
if self.comm_cycle_count < status:
return True
# If we've come this far, we're done.
return False
def _setup_recv_arrays(self):
"""
Creates the recv buffers and calls a non-blocking MPI receive pointing
to the left-hand neighbor.
"""
self.recv_points = np.ones((self.comm_size, 6), dtype='float64') * -1.
self.recv_fields_vals = np.zeros((self.comm_size, len(self.fields)*2), \
dtype='float64')
self.recv_gen_array = np.zeros(self.size, dtype='int64')
self.recv_hooks.append(self.comm.mpi_nonblocking_recv(self.recv_points, \
(self.mine-1)%self.size, tag=10))
self.recv_hooks.append(self.comm.mpi_nonblocking_recv(self.recv_fields_vals, \
(self.mine-1)%self.size, tag=20))
self.recv_hooks.append(self.comm.mpi_nonblocking_recv(self.recv_gen_array, \
(self.mine-1)%self.size, tag=40))
def _send_arrays(self):
"""
Send the data arrays to the right-hand neighbor.
"""
self.send_hooks.append(self.comm.mpi_nonblocking_send(self.points,\
(self.mine+1)%self.size, tag=10))
self.send_hooks.append(self.comm.mpi_nonblocking_send(self.fields_vals,\
(self.mine+1)%self.size, tag=20))
self.send_hooks.append(self.comm.mpi_nonblocking_send(self.gen_array, \
(self.mine+1)%self.size, tag=40))
def _allsum_bin_hits(self):
"""
Add up the hits to all the bins globally for all functions.
"""
for fset in self._fsets:
fset.too_low = self.comm.mpi_allreduce(fset.too_low, op='sum')
fset.too_high = self.comm.mpi_allreduce(fset.too_high, op='sum')
fset.binned = {}
if self.mine == 0:
mylog.info("Function %s had values out of range for these fields:" % \
fset.function.__name__)
for i,field in enumerate(fset.out_labels):
mylog.info("Field %s had %d values too high and %d too low that were not binned." % \
(field, fset.too_high[i], fset.too_low[i]))
for length in self.lengths:
fset.length_bin_hits[length] = \
self.comm.mpi_allreduce(fset.length_bin_hits[length], op='sum')
# Find out how many were successfully binned.
fset.binned[length] = fset.length_bin_hits[length].sum()
# Normalize the counts.
fset.length_bin_hits[length] = \
fset.length_bin_hits[length].astype('float64') / \
fset.binned[length]
# Return it to its original shape.
fset.length_bin_hits[length] = \
fset.length_bin_hits[length].reshape(fset.bin_number)
def _pick_random_points(self, length, size):
"""
Picks out size random pairs separated by length *length*.
"""
# First make random points inside this subvolume.
r1 = np.empty((size,3), dtype='float64')
for dim in range(3):
r1[:,dim] = self.mt.uniform(low=self.ds.left_edge[dim],
high=self.ds.right_edge[dim], size=size)
# Next we find the second point, determined by a random
# theta, phi angle. See Eqns. 1 & 2 from
# http://mathworld.wolfram.com/SpherePointPicking.html,
# but phi and theta are switched to the Physics convention.
if self.constant_phi is None:
phi = self.mt.uniform(low=0, high=2.*math.pi, size=size)
else: phi = self.constant_phi * np.ones(size, dtype='float64')
if self.constant_theta is None:
v = self.mt.uniform(low=0., high=1, size=size)
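            # Drawing theta as arccos of a uniform deviate in [-1, 1] gives
            # directions distributed uniformly in solid angle on the sphere.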
theta = np.arccos(2 * v - 1)
else: theta = self.constant_theta * np.ones(size, dtype='float64')
r2 = np.empty((size,3), dtype='float64')
r2[:,0] = r1[:,0] + length * np.cos(phi) * np.sin(theta)
r2[:,1] = r1[:,1] + length * np.sin(phi) * np.sin(theta)
r2[:,2] = r1[:,2] + length * np.cos(theta)
        # Wrap periodically so the second point is inside the (full) volume.
r2 %= self.period
return (r1, r2)
def _find_nearest_cell(self, points):
"""
Finds the closest grid cell for each point in a vectorized manner.
"""
if self.nlevels == 0:
pos = (points - self.ds.left_edge) / self.width
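            # With cells lexsorted on (x, y, z), the flattened cell index is
            # ix*ny*nz + iy*nz + iz; it is built up term by term below.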
n = (self.sizes[2] * pos[:,2]).astype('int32')
n += self.sizes[2] * (self.sizes[1] * pos[:,1]).astype('int32')
n += self.sizes[2] * self.sizes[1] * (self.sizes[0] * pos[:,0]).astype('int32')
else:
# Normalize the points to a 1-period for use only within the kdtree.
points[:, 0] = points[:, 0] / self.period[0]
points[:, 1] = points[:, 1] / self.period[1]
points[:, 2] = points[:, 2] / self.period[2]
fKD.qv_many = points.T
fKD.nn_tags = np.asfortranarray(np.empty((1, points.shape[0]), dtype='int64'))
fKD.find_many_nn_nearest_neighbors()
# The -1 is for fortran counting.
n = fKD.nn_tags[0,:] - 1
return n
def _get_fields_vals(self, points):
"""
Given points, return the values for the fields we need for those
points.
"""
# First find the grid data index field.
indices = self._find_nearest_cell(points)
results = np.empty((len(indices), len(self.fields)), dtype='float64')
# Put the field values into the columns of results.
for field in self.fields:
col = self.fields_columns[field]
results[:, col] = self.stored_fields[field][indices]
return results
def _eval_points(self, length):
# We need to loop over the points array at least once. Further
# iterations only happen if we have added new points to the array,
# but not as many as we want to, so we need to check again to see if
# we can put more points into the buffer.
added_points = True
while added_points:
# If we need to, add more points to the points array.
if self.generated_points < self.total_values:
# Look for 'empty' slots to put in new pairs.
select = (self.points[:,0] < 0)
ssum = select.sum()
# We'll generate only as many points as we need to/can.
size = min(ssum, self.total_values - self.generated_points)
(new_r1,new_r2) = self._pick_random_points(length, size)
self.generated_points += size
# If size != select.sum(), we need to pad the end of new_r1/r2
# which is what is effectively happening below.
newpoints = np.ones((ssum, 6), dtype='float64') * -1.
newpoints[:size,:3] = new_r1
newpoints[:size,3:] = new_r2
# Now we insert them into self.points.
self.points[select] = newpoints
else:
added_points = False
# If we have an empty buffer here, we can skip everything below.
if (self.points < 0).all():
added_points = False # Not strictly required, but clearer.
break
# Now we have a points array that is either full of unevaluated points,
# or I don't need to make any new points and I'm just processing the
# array. Start by finding the indices of the points I own.
self.points.shape = (self.comm_size*2, 3) # Doesn't make a copy - fast!
select = np.bitwise_or((self.points < self.ds.left_edge).any(axis=1),
(self.points >= self.ds.right_edge).any(axis=1))
select = np.invert(select)
mypoints = self.points[select]
if mypoints.size > 0:
# Get the fields values.
results = self._get_fields_vals(mypoints)
# Put this into self.fields_vals.
self.fields_vals.shape = (self.comm_size*2, len(self.fields))
self.fields_vals[select] = results
# Put our arrays back into their original shapes cheaply!
if mypoints.size > 0:
self.fields_vals.shape = (self.comm_size, len(self.fields)*2)
self.points.shape = (self.comm_size, 6)
# To run the functions, what is key is that the
# second point in the pair is ours.
second_points = self.points[:,3:]
select = np.bitwise_or((second_points < self.ds.left_edge).any(axis=1),
(second_points >= self.ds.right_edge).any(axis=1))
select = np.invert(select)
if select.any():
points_to_eval = self.points[select]
fields_to_eval = self.fields_vals[select]
# Find the normal vector between our points.
vec = np.abs(points_to_eval[:,:3] - points_to_eval[:,3:])
norm = np.sqrt(np.sum(np.multiply(vec,vec), axis=1))
            # Normalize each separation vector to unit length.
            vec = np.divide(vec, norm[:, np.newaxis])
# Now evaluate the functions.
for fcn_set in self._fsets:
fcn_results = fcn_set._eval_st_fcn(fields_to_eval,points_to_eval,
vec)
fcn_set._bin_results(length, fcn_results)
# Now clear the buffers at the processed points.
self.points[select] = np.array([-1.]*6, dtype='float64')
else:
# We didn't clear any points, so we should move on with our
# lives and pass this buffer along.
added_points = False
@parallel_blocking_call
    def write_out_means(self, fn = "%s.txt"):
r"""Writes out the weighted-average value for each function for
each dimension for each ruler length to a text file. The data is written
to files of the name 'function_name.txt' in the current working
directory.
Examples
--------
>>> tpf.write_out_means()
"""
for fset in self._fsets:
fp = self.comm.write_on_root(fn % fset.function.__name__)
fset._avg_bin_hits()
line = "# length".ljust(sep)
line += "count".ljust(sep)
for dim in fset.dims:
line += ("%s" % fset.out_labels[dim]).ljust(sep)
fp.write(line + "\n")
for length in self.lengths:
line = ("%1.5e" % length).ljust(sep)
line += ("%d" % fset.binned[length]).ljust(sep)
for dim in fset.dims:
if fset.sqrt[dim]:
line += ("%1.5e" % \
math.sqrt(fset.length_avgs[length][dim])).ljust(sep)
else:
line += ("%1.5e" % \
fset.length_avgs[length][dim]).ljust(sep)
line += "\n"
fp.write(line)
fp.close()
@parallel_root_only
    def write_out_arrays(self, fn = "%s.h5"):
r"""Writes out the raw probability bins and the bin edges to an HDF5 file
for each of the functions. The files are named
'function_name.txt' and saved in the current working directory.
Examples
--------
>>> tpf.write_out_arrays()
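        The datasets can be read back with h5py; the file name below assumes
        a function named ``rms_vel`` was added:
        >>> import h5py
        >>> f = h5py.File("rms_vel.h5", "r")
        >>> lengths = f["/lengths"][:]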
"""
if self.mine == 0:
for fset in self._fsets:
f = h5py.File(fn % fset.function.__name__, "w")
bin_names = []
prob_names = []
bin_counts = []
for dim in fset.dims:
f.create_dataset("/bin_edges_%02d_%s" % \
(dim, fset.out_labels[dim]), \
data=fset.bin_edges[dim])
bin_names.append("/bin_edges_%02d_%s" % \
(dim, fset.out_labels[dim]))
for i,length in enumerate(self.lengths):
f.create_dataset("/prob_bins_%05d" % i, \
data=fset.length_bin_hits[length])
prob_names.append("/prob_bins_%05d" % i)
bin_counts.append([fset.too_low.sum(), fset.binned[length],
fset.too_high.sum()])
f.create_dataset("/bin_edges_names", data=bin_names)
#f.create_dataset("/prob_names", data=prob_names)
f.create_dataset("/lengths", data=self.lengths)
f.create_dataset("/counts", data=bin_counts)
f.close()
@parallel_root_only
    def write_out_correlation(self):
r"""A special output function for doing two point correlation functions.
Outputs the correlation function xi(r) in a text file
'function_name_corr.txt' in the current working directory.
Examples
--------
>>> tpf.write_out_correlation()
"""
for fset in self._fsets:
# Only operate on correlation functions.
            if fset.corr_norm is None: continue
fp = self.comm.write_on_root("%s_correlation.txt" % fset.function.__name__)
line = "# length".ljust(sep)
line += "\\xi".ljust(sep)
fp.write(line + "\n")
xi = fset._corr_sum_norm()
for length in self.lengths:
line = ("%1.5e" % length).ljust(sep)
line += ("%1.5e" % xi[length]).ljust(sep)
fp.write(line + "\n")
fp.close()
class FcnSet(TwoPointFunctions):
    def __init__(self, tpf, function, min_edge, out_labels, sqrt, corr_norm):
self.tpf = tpf # The overarching TPF class
self.function = function # Function to eval between the two points.
self.min_edge = min_edge # The length of the minimum edge of the box.
self.out_labels = out_labels # For output.
self.sqrt = sqrt # which columns to sqrt on output.
self.corr_norm = corr_norm # A number used to normalize a correlation function.
# These below are used to track how many times the function returns
# unbinned results.
self.too_low = np.zeros(len(self.out_labels), dtype='int32')
self.too_high = np.zeros(len(self.out_labels), dtype='int32')
    def set_pdf_params(self, bin_type="lin", bin_number=1000, bin_range=None):
r"""Set the parameters used to build the Probability Distribution Function
for each ruler length for this function. The values output by the
function are slotted into the bins described here.
Parameters
----------
bin_type : String
Controls the edges of the bins spaced evenly in
logarithmic or linear space, set by "log" or "lin", respectively.
A single string, or list of strings for N-dim binning.
Default = "lin".
bin_number : Integer
Sets how many bins to create, evenly spaced by the above
parameter. A single integer, or a list of integers for N-dim
binning. Default = 1000.
bin_range : Float
A pair of values giving the range for the bins.
A pair of floats (a list), or a list of pairs for N-dim binning.
Default = None.
Examples
--------
>>> f1.set_pdf_params(bin_type='log', bin_range=[5e4, 5.5e13],
... bin_number=1000)
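        For a second function ``f2`` with two outputs, each parameter may be
        given as a list with one entry per output (the values here are
        illustrative):
        >>> f2.set_pdf_params(bin_type=['log', 'lin'],
        ...     bin_range=[[5e4, 5.5e13], [-1., 1.]],
        ...     bin_number=[1000, 100])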
"""
        # The ruler lengths must already be set on the parent TwoPointFunctions.
        if not hasattr(self.tpf, "lengths"):
            mylog.error("Please fully initialize the TwoPointFunctions object before calling set_pdf_params().")
return None
# Make sure they're either all lists or only one is.
input = [bin_type, bin_number, bin_range]
lists = 0
for thing in input:
if type(thing) == list:
lists += 1
if lists > 1 and lists < 3:
mylog.error("Either all the inputs need to be lists, or only one.")
return None
# Make sure they're all the same length if they're lists.
if lists == 3:
first_len = 0
for thing in input:
if first_len == 0:
first_len = len(thing)
if first_len == 0:
mylog.error("Input cannot be an empty list.")
return None
continue
if first_len != len(thing):
mylog.error("All the inputs need to have the same length.")
return None
# If they are not all lists, put the input into lists for convenience.
if lists == 1:
bin_type, bin_number = [bin_type], [bin_number]
bin_range = [bin_range]
self.bin_type = bin_type
self.bin_number = np.array(bin_number) - 1
self.dims = range(len(bin_type))
# Create the dict that stores the arrays to store the bin hits, and
# the arrays themselves.
self.length_bin_hits = {}
for length in self.tpf.lengths:
# It's easier to index flattened, but will be unflattened later.
self.length_bin_hits[length] = np.zeros(self.bin_number,
dtype='int64').flatten()
# Create the bin edges for each dimension.
        # self.bin_edges is indexed by dimension.
self.bin_edges = {}
for dim in self.dims:
# Error check.
if len(bin_range[dim]) != 2:
raise ValueError("bin_range must have two values.")
if bin_range[dim][1] <= bin_range[dim][0]:
raise ValueError("bin_range[1] must be larger than bin_range[0]")
# Make the edges for this dimension.
if bin_type[dim] == "lin":
self.bin_edges[dim] = np.linspace(bin_range[dim][0], bin_range[dim][1],
bin_number[dim])
elif bin_type[dim] == "log":
self.bin_edges[dim] = np.logspace(math.log10(bin_range[dim][0]),
math.log10(bin_range[dim][1]), bin_number[dim])
else:
raise SyntaxError("bin_edges is either \"lin\" or \"log\".")
def _eval_st_fcn(self, results, points, vec):
"""
Return the value of the function using the provided results.
"""
return self.function(results[:,:len(self.tpf.fields)],
results[:,len(self.tpf.fields):], points[:,:3], points[:,3:], vec)
"""
NOTE - A function looks like:
def stuff(a,b,r1,r2, vec):
return [(a[0] - b[0])/(a[1] + b[1])]
where a and b refer to different points in space and the indices
are for the different fields, which are given when the function is
added. The results need to be a list or array even if it's only one
item.
"""
def _bin_results(self, length, results):
"""
        Add hits to the bins corresponding to these results. length_bin_hits
        is flattened, so we need to figure out the offset for this hit by
factoring the sizes of the other dimensions.
"""
        hit_bin = np.zeros(results.shape[0], dtype='int64')
        good = np.ones(results.shape[0], dtype='bool')
        for dim in range(len(self.out_labels)):
            # The stride for this dimension is the product of the bin edge
            # counts of the lower dimensions; reset it for each dimension.
            multi = 1
            for d1 in range(dim):
                multi *= self.bin_edges[d1].size
if dim == 0 and len(self.out_labels)==1:
try:
digi = np.digitize(results, self.bin_edges[dim])
except ValueError:
# The user probably did something like
# return a * b rather than
# return a[0] * b[0], which will only happen
# for single field functions.
digi = np.digitize(results[0], self.bin_edges[dim])
else:
digi = np.digitize(results[:,dim], self.bin_edges[dim])
too_low = (digi == 0)
too_high = (digi == self.bin_edges[dim].size)
self.too_low[dim] += (too_low).sum()
self.too_high[dim] += (too_high).sum()
newgood = np.bitwise_and(np.invert(too_low), np.invert(too_high))
good = np.bitwise_and(good, newgood)
hit_bin += np.multiply((digi - 1), multi)
digi_bins = np.arange(self.length_bin_hits[length].size+1)
hist, digi_bins = np.histogram(hit_bin[good], digi_bins)
self.length_bin_hits[length] += hist
def _dim_sum(self, a, dim):
"""
Given a multidimensional array a, this finds the sum over all the
elements leaving the dimension dim untouched.
"""
dims = np.arange(len(a.shape))
dims = np.flipud(dims)
gt_dims = dims[dims > dim]
lt_dims = dims[dims < dim]
iter_dims = np.concatenate((gt_dims, lt_dims))
for this_dim in iter_dims:
a = a.sum(axis=this_dim)
return a
def _avg_bin_hits(self):
"""
For each dimension and length of bin_hits return the weighted average.
"""
self.length_avgs = defaultdict(dict)
for length in self.tpf.lengths:
for dim in self.dims:
self.length_avgs[length][dim] = \
(self._dim_sum(self.length_bin_hits[length], dim) * \
((self.bin_edges[dim][:-1] + self.bin_edges[dim][1:]) / 2.)).sum()
def _corr_sum_norm(self):
"""
Return the correlations xi for this function. We are tacitly assuming
that all correlation functions are one dimensional.
"""
xi = {}
for length in self.tpf.lengths:
xi[length] = -1 + np.sum(self.length_bin_hits[length] * \
self.bin_edges[0][:-1]) / self.corr_norm
return xi