Source code for floatcsep.utils

# python libraries
import copy
import hashlib

import numpy
import re
import multiprocessing
import os
import mercantile
import shapely.geometry
import scipy.stats
import itertools
import functools
import yaml
import pandas
import seaborn
import filecmp
from datetime import datetime, date
from functools import partial
from typing import Sequence, Union
from matplotlib import pyplot
from matplotlib.lines import Line2D
from collections import OrderedDict

# pyCSEP libraries
import six
import csep.core
import csep.utils
from csep.core.regions import CartesianGrid2D, compute_vertices
from csep.utils.plots import plot_spatial_dataset
from csep.models import Polygon
from csep.core.regions import QuadtreeGrid2D, geographical_area_from_bounds
from csep.utils.calc import cleaner_range

# floatCSEP libraries

import floatcsep.accessors
import floatcsep.extras
import floatcsep.readers

_UNITS = ['years', 'months', 'weeks', 'days']
_PD_FORMAT = ['YS', 'MS', 'W', 'D']


def parse_csep_func(func):
    """
    Searches in pyCSEP and floatCSEP for a function or method whose name
    matches the provided string.

    Args:
        func (str, obj): representing the name of the pycsep/floatcsep
         function or method

    Returns:
        The callable function/method object. If it was already callable,
        returns the same input
    """

    def recgetattr(obj, attr, *args):
        def _getattr(obj_, attr_):
            return getattr(obj_, attr_, *args)

        return functools.reduce(_getattr, [obj] + attr.split('.'))

    if callable(func):
        return func
    elif func is None:
        return func
    else:
        _target_modules = [csep,
                           csep.utils,
                           csep.utils.plots,
                           csep.core.regions,
                           floatcsep.utils,
                           floatcsep.accessors,
                           floatcsep.extras,
                           floatcsep.readers.HDF5Serializer,
                           floatcsep.readers.ForecastParsers]
        for module in _target_modules:
            try:
                return recgetattr(module, func)
            except AttributeError:
                pass
        raise AttributeError(
            f'Evaluation/Plot/Region function {func} has not yet been'
            f' implemented in floatcsep or pycsep')

def parse_timedelta_string(window, exp_class='ti'):
    """
    Parses a float or string representing the testing time window length.

    Note:
        Time-independent experiments default to `years` as the time unit,
        whereas time-dependent experiments default to `days`

    Args:
        window (str, int): length of the time window
        exp_class (str): experiment class

    Returns:
        Formatted :py:class:`str` representing the length and unit (year,
        month, week, day) of the time window
    """
    if isinstance(window, str):
        try:
            n, unit_ = [i for i in re.split(r'(\d+)', window) if i]
            unit = [i for i in [j[:-1] for j in _UNITS]
                    if i in unit_.lower()][0]
            return f'{n}-{unit}s'
        except (ValueError, IndexError):
            raise ValueError('Time window is misspecified. '
                             'Try the amount followed by the time unit '
                             '(e.g. 1 day, 1 months, 3 years)')

    elif isinstance(window, (int, float)):
        n = window
        unit = 'year' if exp_class == 'ti' else 'day'
        return f'{n}-{unit}s'

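# Illustrative examples (not part of the module): the parser normalizes
# several spellings into the '<n>-<unit>s' format used internally.
#   parse_timedelta_string('1 day')               # -> '1-days'
#   parse_timedelta_string('3 years')             # -> '3-years'
#   parse_timedelta_string(0.5, exp_class='td')   # -> '0.5-days'
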
def read_time_cfg(time_config, **kwargs):
    """
    Builds the temporal configuration of an experiment.

    Args:
        time_config (dict): Dictionary containing the explicit temporal
         attributes of the experiment (see `_attrs` local variable)
        **kwargs: Only the keywords contained in the local variable `_attrs`
         are captured. This ensures that the keywords passed to an
         :class:`~floatcsep.core.Experiment` object are captured

    Returns:
        A dictionary containing the experiment time attributes and the time
        windows to be evaluated
    """
    _attrs = ['start_date', 'end_date', 'intervals', 'horizon', 'offset',
              'growth', 'exp_class']
    time_config = copy.deepcopy(time_config)
    if time_config is None:
        time_config = {}

    try:
        experiment_class = time_config.get('exp_class', kwargs['exp_class'])
    except KeyError:
        experiment_class = 'ti'
        time_config['exp_class'] = experiment_class

    time_config.update({i: j for i, j in kwargs.items() if i in _attrs})
    if 'horizon' in time_config.keys():
        time_config['horizon'] = parse_timedelta_string(
            time_config['horizon'])
    if 'offset' in time_config.keys():
        time_config['offset'] = parse_timedelta_string(time_config['offset'])

    if experiment_class == 'ti':
        time_config['timewindows'] = timewindows_ti(**time_config)
        return time_config
    elif experiment_class == 'td':
        time_config['timewindows'] = timewindows_td(**time_config)
        return time_config


def read_region_cfg(region_config, **kwargs):
    """
    Builds the region configuration of an experiment.

    Args:
        region_config (dict): Dictionary containing the explicit region
         attributes of the experiment (see `_attrs` local variable)
        **kwargs: Only the keywords contained in the local variable `_attrs`
         are captured. This ensures that the keywords passed to an
         :class:`~floatcsep.core.Experiment` object are captured

    Returns:
        A dictionary containing the region attributes of the experiment
    """
    region_config = copy.deepcopy(region_config)
    _attrs = ['region', 'mag_min', 'mag_max', 'mag_bin', 'magnitudes',
              'depth_min', 'depth_max']
    if region_config is None:
        region_config = {}
    region_config.update({i: j for i, j in kwargs.items() if i in _attrs})

    dmin = region_config.get('depth_min', -2)
    dmax = region_config.get('depth_max', 6000)
    depths = cleaner_range(dmin, dmax, dmax - dmin)
    magnitudes = region_config.get('magnitudes', None)
    if magnitudes is None:
        magmin = region_config['mag_min']
        magmax = region_config['mag_max']
        magbin = region_config['mag_bin']
        magnitudes = cleaner_range(magmin, magmax, magbin)

    region_data = region_config.get('region', None)
    try:
        region = parse_csep_func(region_data)(name=region_data,
                                              magnitudes=magnitudes) \
            if region_data else None
    except AttributeError:
        if isinstance(region_data, str):
            filename = os.path.join(kwargs.get('path', ''), region_data)
            with open(filename, 'r') as file_:
                parsed_region = file_.readlines()
                try:
                    data = numpy.array([re.split(r'\s+|,', i.strip())
                                        for i in parsed_region], dtype=float)
                except ValueError:
                    data = numpy.array([re.split(r'\s+|,', i.strip())
                                        for i in parsed_region[1:]],
                                       dtype=float)
                dh1 = scipy.stats.mode(
                    numpy.diff(numpy.unique(data[:, 0]))).mode
                dh2 = scipy.stats.mode(
                    numpy.diff(numpy.unique(data[:, 1]))).mode
                dh = numpy.nanmin([dh1, dh2])
                region = CartesianGrid2D.from_origins(
                    data, name=region_data, magnitudes=magnitudes, dh=dh)
            region_config.update({'path': region_data})
        else:
            region_data['magnitudes'] = magnitudes
            region = CartesianGrid2D.from_dict(region_data)

    region_config.update({'depths': depths,
                          'magnitudes': magnitudes,
                          'region': region})

    return region_config

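# Minimal sketch of how these readers are typically fed (assumption: values
# are illustrative, not from a real experiment; 'italy_csep_region' is a
# pyCSEP region function that is resolved through parse_csep_func):
#   time_cfg = read_time_cfg(None,
#                            start_date=datetime(2010, 1, 1),
#                            end_date=datetime(2020, 1, 1),
#                            intervals=10, exp_class='ti')
#   region_cfg = read_region_cfg({'region': 'italy_csep_region',
#                                 'mag_min': 4.0, 'mag_max': 8.0,
#                                 'mag_bin': 0.1})
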
def timewindow2str(datetimes: Union[Sequence[datetime],
                                    Sequence[Sequence[datetime]]]):
    """
    Converts a time window (list/tuple of datetimes) to a string that
    represents it. Can be a single time window or a list of time windows.

    Args:
        datetimes: A pair of datetimes (single time window) or a sequence of
         such pairs (multiple time windows)

    Returns:
        A string (or list of strings) of the form '%Y-%m-%d_%Y-%m-%d'
    """
    if isinstance(datetimes[0], datetime):
        return '_'.join([j.date().isoformat() for j in datetimes])

    elif isinstance(datetimes[0], (list, tuple)):
        return ['_'.join([j.date().isoformat() for j in i])
                for i in datetimes]

def str2timewindow(tw_string: Union[str, Sequence[str]]):
    """
    Converts a string representation of a time window into a list of
    datetimes representing the time window edges.

    Args:
        tw_string: A string (or list of strings) of the form
         '%Y-%m-%d_%Y-%m-%d'

    Returns:
        The start and end dates as :py:class:`datetime.datetime`, or a list
        of such pairs
    """
    if isinstance(tw_string, str):
        start_date, end_date = [datetime.fromisoformat(i)
                                for i in tw_string.split('_')]
        return start_date, end_date

    elif isinstance(tw_string, (list, tuple)):
        datetimes = []
        for twstr in tw_string:
            start_date, end_date = [datetime.fromisoformat(i)
                                    for i in twstr.split('_')]
            datetimes.append([start_date, end_date])
        return datetimes

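# Round-trip example (illustrative): both helpers use the
# '%Y-%m-%d_%Y-%m-%d' string form.
#   timewindow2str([datetime(2010, 1, 1), datetime(2011, 1, 1)])
#   # -> '2010-01-01_2011-01-01'
#   str2timewindow('2010-01-01_2011-01-01')
#   # -> (datetime(2010, 1, 1, 0, 0), datetime(2011, 1, 1, 0, 0))
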
def timewindows_ti(start_date=None,
                   end_date=None,
                   intervals=None,
                   horizon=None,
                   growth='incremental',
                   **_):
    """
    Creates the testing intervals for a time-independent experiment.

    Note:
        The following argument combinations are possible:
            - (start_date, end_date)
            - (start_date, end_date, intervals)
            - (start_date, end_date, horizon)
            - (start_date, intervals, horizon)

    Args:
        start_date (datetime.datetime): Start of the experiment
        end_date (datetime.datetime): End of the experiment
        intervals (int): number of intervals to discretize the time span
        horizon (str): time length of each interval
        growth (str): incremental or cumulative time windows

    Returns:
        List of tuples containing the lower and upper boundaries of each
        testing window, as :py:class:`datetime.datetime`.
    """
    frequency = None

    if (intervals is None) and (horizon is None):
        intervals = 1
    elif horizon:
        n, unit = horizon.split('-')
        frequency = f'{n}{_PD_FORMAT[_UNITS.index(unit)]}'

    periods = intervals + 1 if intervals else intervals
    try:
        timelimits = pandas.date_range(start=start_date,
                                       end=end_date,
                                       periods=periods,
                                       freq=frequency).to_pydatetime()
    except ValueError:
        raise ValueError(
            'The following experiment parameters combinations are possible:\n'
            '   (start_date, end_date)\n'
            '   (start_date, end_date, intervals)\n'
            '   (start_date, end_date, timewindow)\n'
            '   (start_date, intervals, timewindow)\n')

    if growth == 'incremental':
        return [(i, j) for i, j in zip(timelimits[:-1], timelimits[1:])]

    elif growth == 'cumulative':
        return [(timelimits[0], i) for i in timelimits[1:]]

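# Illustrative example (not part of the module): ten non-overlapping
# one-year windows, using the already-parsed horizon format produced by
# parse_timedelta_string.
#   windows = timewindows_ti(start_date=datetime(2010, 1, 1),
#                            end_date=datetime(2020, 1, 1),
#                            horizon='1-years')
#   timewindow2str(windows)[0]   # -> '2010-01-01_2011-01-01'
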
def timewindows_td(start_date=None,
                   end_date=None,
                   timeintervals=None,
                   timehorizon=None,
                   timeoffset=None,
                   **_):
    """
    Creates the testing intervals for a time-dependent experiment.

    Note:
        The following argument combinations are possible:
            - (start_date, end_date, timeintervals)
            - (start_date, end_date, timehorizon)
            - (start_date, timeintervals, timehorizon)
            - (start_date, end_date, timehorizon, timeoffset)
            - (start_date, timeintervals, timehorizon, timeoffset)

    Args:
        start_date (datetime.datetime): Start of the experiment
        end_date (datetime.datetime): End of the experiment
        timeintervals (int): number of intervals to discretize the time span
        timehorizon (str): time length of each time window
        timeoffset (str): Offset between consecutive forecasts. If None or
         equal to timehorizon, windows are non-overlapping

    Returns:
        List of tuples containing the lower and upper boundaries of each
        testing window, as :py:class:`datetime.datetime`.
    """
    frequency = None

    if timehorizon:
        n, unit = timehorizon.split('-')
        frequency = f'{n}{_PD_FORMAT[_UNITS.index(unit)]}'

    periods = timeintervals + 1 if timeintervals else timeintervals
    try:
        offset = timeoffset.split('-') if timeoffset else None
        start_offset = start_date + pandas.DateOffset(
            **{offset[1]: float(offset[0])}) if offset else start_date
        end_offset = end_date - pandas.DateOffset(
            **{offset[1]: float(offset[0])}) if offset else end_date

        lower_limits = pandas.date_range(start=start_date,
                                         end=end_offset,
                                         periods=periods,
                                         freq=frequency).to_pydatetime()[:-1]
        upper_limits = pandas.date_range(start=start_offset,
                                         end=end_date,
                                         periods=periods,
                                         freq=frequency).to_pydatetime()[:-1]
    except ValueError:
        raise ValueError(
            'The following experiment parameters combinations are possible:\n'
            '   (start_date, end_date)\n'
            '   (start_date, end_date, intervals)\n'
            '   (start_date, end_date, timewindow)\n'
            '   (start_date, intervals, timewindow)\n')
    # if growth == 'incremental':
    #     timewindows = [(i, j) for i, j in zip(timelimits[:-1],
    #                                           timelimits[1:])]
    # elif growth == 'cumulative':
    #     timewindows = [(timelimits[0], i) for i in timelimits[1:]]
    # return timewindows

    return [(lower, upper) for lower, upper in zip(lower_limits,
                                                   upper_limits)]

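# Illustrative sketch (assumption: relies on the return statement restored
# above; values are arbitrary): daily, non-overlapping one-day windows for a
# time-dependent experiment.
#   windows_td = timewindows_td(start_date=datetime(2020, 1, 1),
#                               end_date=datetime(2020, 2, 1),
#                               timehorizon='1-days',
#                               timeoffset='1-days')
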
class Task:

    def __init__(self, instance, method, **kwargs):
        """
        Base node of the workload distribution. Lazily wraps objects, methods
        and their arguments for them to be executed later. For instance, it
        can wrap a floatcsep.Model, its method 'create_forecast' and the
        argument 'time_window', which can be executed later with Task.call()
        when, for example, task dependencies (parent nodes) have been
        completed.

        Args:
            instance: can be floatcsep.Experiment, floatcsep.Model,
             floatcsep.Evaluation
            method: the instance's method to be lazily created
            **kwargs: keyword arguments passed to method.
        """

        self.obj = instance
        self.method = method
        self.kwargs = kwargs

        self.store = None  # Bool for nested tasks. DEPRECATED

    def sign_match(self, obj=None, met=None, kw_arg=None):
        """
        Checks whether the Task matches a given signature, for simplicity.

        Purpose is to check from the outside if the Task is from a given
        object (Model, Experiment, etc.), matching either name, object or
        description.

        Args:
            obj: Instance or instance's name str. Instance is preferred
            met: Name of the method
            kw_arg: Only the value (not key) of the kwargs dictionary

        Returns:
            True if the signature matches, False otherwise
        """
        if self.obj == obj or obj == getattr(self.obj, 'name', None):
            if met == self.method:
                if kw_arg in self.kwargs.values():
                    return True
        return False

    def __str__(self):
        task_str = f'{self.__class__}\n\t' \
                   f'Instance: {self.obj.__class__.__name__}\n'
        a = getattr(self.obj, 'name', None)
        if a:
            task_str += f'\tName: {a}\n'
        task_str += f'\tMethod: {self.method}\n'
        for i, j in self.kwargs.items():
            task_str += f'\t\t{i}: {j} \n'

        return task_str[:-2]

    def run(self):
        if hasattr(self.obj, 'store'):
            self.obj = self.obj.store
        output = getattr(self.obj, self.method)(**self.kwargs)

        if output:
            self.store = output
            del self.obj

        return output

    def __call__(self, *args, **kwargs):
        return self.run()

    def check_exist(self):
        pass

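# Illustrative sketch of the lazy-call pattern (assumption: `model` is a
# floatcsep.Model exposing a 'create_forecast' method; the keyword name
# 'tstring' is hypothetical).
#   task = Task(instance=model, method='create_forecast',
#               tstring='2010-01-01_2011-01-01')
#   task.sign_match(obj=model, met='create_forecast',
#                   kw_arg='2010-01-01_2011-01-01')    # -> True
#   task()   # only now is model.create_forecast(...) actually executed
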
class TaskGraph:
    """
    Context manager of the floatcsep workload distribution.

    Assigns tasks to a node and defines their dependencies (parent nodes).
    Contains a 'tasks' dictionary whose dict_keys are the Tasks to be
    executed, with dict_values as the Task's dependencies.
    """

    def __init__(self):
        self.tasks = OrderedDict()
        self._ntasks = 0
        self.name = 'floatcsep.utils.TaskGraph'

    @property
    def ntasks(self):
        return self._ntasks

    @ntasks.setter
    def ntasks(self, n):
        self._ntasks = n

    def add(self, task):
        """
        Simply adds a defined task to the graph.

        Args:
            task: floatcsep.utils.Task

        Returns:

        """
        self.tasks[task] = []
        self.ntasks += 1

    def add_dependency(self, task, dinst=None, dmeth=None, dkw=None):
        """
        Adds a dependency to a task already inserted into the TaskGraph.
        Searches within the pre-added tasks for a signature match by their
        name/instance, method and keyword arguments.

        Args:
            task: Task to which a dependency will be assigned
            dinst: object/name of the dependency
            dmeth: method of the dependency
            dkw: keyword argument of the dependency

        Returns:

        """
        deps = []
        for i, other_tasks in enumerate(self.tasks.keys()):
            if other_tasks.sign_match(dinst, dmeth, dkw):
                deps.append(other_tasks)

        self.tasks[task].extend(deps)

    def run(self):
        """
        Iterates through all the graph tasks and runs them.

        Returns:

        """
        for task, deps in self.tasks.items():
            task.run()

    def __call__(self, *args, **kwargs):
        return self.run()

    def check_exist(self):
        pass


class MarkdownReport:
    """ Class to generate a Markdown report from a study """

    def __init__(self, outname='report.md'):
        self.outname = outname
        self.toc = []
        self.has_title = True
        self.has_introduction = False
        self.markdown = []

    def add_introduction(self, adict):
        """ Generates the document header from a dictionary """
        first = f"# CSEP Testing Results: {adict['simulation_name']} \n" \
                f"**Forecast Name:** {adict['forecast_name']} \n" \
                f"**Simulation Start Time:** {adict['origin_time']} \n" \
                f"**Evaluation Time:** {adict['evaluation_time']} \n" \
                f"**Catalog Source:** {adict['catalog_source']} \n" \
                f"**Number Simulations:** {adict['num_simulations']}\n"

        # used to determine whether to place the TOC at the beginning of the
        # document or after the introduction.
        self.has_introduction = True
        self.markdown.append(first)
        return first

    def add_text(self, text):
        """
        Text should be a list of strings where each string will be on its own
        line. Each add_text command represents a paragraph.

        Args:
            text (list): lines to write

        Returns:

        """
        self.markdown.append(' '.join(text) + '\n\n')

    def add_figure(self, title, relative_filepaths, level=2, ncols=1,
                   add_ext=False, text='', caption='', width=None):
        """
        This function expects a list of filepaths. If you want the output
        stacked, select a value of ncols. ncols should be divisible by the
        number of filepaths.
        todo: modify formatted_paths to work when not divisible.

        Args:
            title: name of the figure
            level (int): value 1-6 depending on the heading
            relative_filepaths (str or List[Tuple[str]]): list of paths in
             order to make table

        Returns:

        """
        # verify filepaths have proper extension; should always be png
        is_single = False
        paths = []
        if isinstance(relative_filepaths, six.string_types):
            is_single = True
            paths.append(relative_filepaths)
        else:
            paths = relative_filepaths

        correct_paths = []
        if add_ext:
            for fp in paths:
                correct_paths.append(fp + '.png')
        else:
            correct_paths = paths

        # generate new lists with size ncols
        formatted_paths = [correct_paths[i:i + ncols]
                           for i in range(0, len(paths), ncols)]

        # convert str into a list, where each potential row is an iter not str
        def build_header(_row):
            top = "|"
            bottom = "|"
            for i, _ in enumerate(_row):
                if i == ncols:
                    break
                top += " |"
                bottom += " --- |"
            return top + '\n' + bottom

        size_ = bool(width) * f'width={width}'

        def add_to_row(_row):
            if len(_row) == 1:
                return f'<img src="{_row[0]}" {size_}/>'
            string = '| '
            for item in _row:
                string = string + f'<img src="{item}" width={width}/>'
            return string

        level_string = f"{level * '#'}"
        result_cell = []
        locator = title.lower().replace(" ", "_")
        result_cell.append(
            f'{level_string} {title} <a name="{locator}"></a>\n')
        result_cell.append(f'{text}\n')

        for i, row in enumerate(formatted_paths):
            if i == 0 and not is_single and ncols > 1:
                result_cell.append(build_header(row))
            result_cell.append(add_to_row(row))
        result_cell.append('\n')
        result_cell.append(f'{caption}')

        self.markdown.append('\n'.join(result_cell) + '\n')

        # generate metadata for TOC
        self.toc.append((title, level, locator))

    def add_heading(self, title, level=1, text='', add_toc=True):
        # multiplying a char simply repeats it
        if isinstance(text, str):
            text = [text]

        cell = []
        level_string = f"{level * '#'}"
        locator = title.lower().replace(" ", "_")
        sub_heading = f'{level_string} {title} <a name="{locator}"></a>\n'
        cell.append(sub_heading)
        try:
            for item in list(text):
                cell.append(item)
        except Exception as ex:
            raise RuntimeWarning(
                "Unable to add document subhead, text must be iterable.")
        self.markdown.append('\n'.join(cell) + '\n')

        # generate metadata for TOC
        if add_toc:
            self.toc.append((title, level, locator))

    def add_list(self, _list):
        cell = []
        for item in _list:
            cell.append(f"* {item}")
        self.markdown.append('\n'.join(cell) + '\n\n')

    def add_title(self, title, text):
        self.has_title = True
        self.add_heading(title, 1, text, add_toc=False)

    def table_of_contents(self):
        """ Generates the table of contents based on the contents of the
        document. """
        if len(self.toc) == 0:
            return
        toc = ["# Table of Contents"]
        for i, elem in enumerate(self.toc):
            title, level, locator = elem
            space = ' ' * (level - 1)
            toc.append(f"{space}1. [{title}](#{locator})")
        insert_loc = 1 if self.has_title else 0
        self.markdown.insert(insert_loc, '\n'.join(toc) + '\n\n')

    def add_table(self, data, use_header=True):
        """
        Generates a table from HTML and styles it using the bootstrap class.

        Args:
            data List[Tuple[str]]: should be (nrows, ncols) in size. All rows
             should be the same size

        Returns:
            table (str): this can be added to a subheading or other cell if
             desired.
        """
        table = ['<div class="table table-striped">', f'<table>']

        def make_header(row):
            header = []
            header.append('<tr>')
            for item in row:
                header.append(f'<th>{item}</th>')
            header.append('</tr>')
            return '\n'.join(header)

        def add_row(row):
            table_row = ['<tr>']
            for item in row:
                table_row.append(f"<td>{item}</td>")
            table_row.append('</tr>')
            return '\n'.join(table_row)

        for i, row in enumerate(data):
            if i == 0 and use_header:
                table.append(make_header(row))
            else:
                table.append(add_row(row))
        table.append('</table>')
        table.append('</div>')
        table = '\n'.join(table)
        self.markdown.append(table + '\n\n')

    def save(self, save_dir):
        output = list(itertools.chain.from_iterable(self.markdown))
        full_md_fname = os.path.join(save_dir, self.outname)
        with open(full_md_fname, 'w') as f:
            f.writelines(output)


class NoAliasLoader(yaml.Loader):
    @staticmethod
    def ignore_aliases(self):
        return True


class ExperimentComparison:

    def __init__(self, original, reproduced, **kwargs):
        """

        """
        self.original = original
        self.reproduced = reproduced

        self.num_results = {}
        self.file_comp = {}

    @staticmethod
    def obs_diff(obs_orig, obs_repr):
        return numpy.abs(numpy.divide(
            (numpy.array(obs_orig) - numpy.array(obs_repr)),
            numpy.array(obs_orig)))

    @staticmethod
    def test_stat(test_orig, test_repr):
        if isinstance(test_orig[0], str):
            if not isinstance(test_orig[1], str):
                stats = numpy.array([0,
                                     numpy.divide(
                                         (test_repr[1] - test_orig[1]),
                                         test_orig[1]),
                                     0, 0])
            else:
                stats = None
        else:
            stats_orig = numpy.array([numpy.mean(test_orig),
                                      numpy.std(test_orig),
                                      scipy.stats.skew(test_orig)])
            stats_repr = numpy.array([numpy.mean(test_repr),
                                      numpy.std(test_repr),
                                      scipy.stats.skew(test_repr)])
            ks = scipy.stats.ks_2samp(test_orig, test_repr)
            stats = [*numpy.divide(numpy.abs(stats_repr - stats_orig),
                                   stats_orig),
                     ks.pvalue]
        return stats

    def get_results(self):

        win_orig = timewindow2str(self.original.timewindows)
        win_repr = timewindow2str(self.reproduced.timewindows)

        tests_orig = self.original.tests
        tests_repr = self.reproduced.tests

        models_orig = [i.name for i in self.original.models]
        models_repr = [i.name for i in self.reproduced.models]

        results = dict.fromkeys([i.name for i in tests_orig])

        for test in tests_orig:
            if test.type in ['consistency', 'comparative']:
                results[test.name] = dict.fromkeys(win_orig)
                for tw in win_orig:
                    results_orig = self.original.read_results(test, tw)
                    results_repr = self.reproduced.read_results(test, tw)
                    results[test.name][tw] = {
                        models_orig[i]: {
                            'observed_statistic': self.obs_diff(
                                results_orig[i].observed_statistic,
                                results_repr[i].observed_statistic),
                            'test_statistic': self.test_stat(
                                results_orig[i].test_distribution,
                                results_repr[i].test_distribution)
                        } for i in range(len(models_orig))}
            else:
                results_orig = self.original.read_results(test, win_orig[-1])
                results_repr = self.reproduced.read_results(test,
                                                            win_orig[-1])
                results[test.name] = {
                    models_orig[i]: {
                        'observed_statistic': self.obs_diff(
                            results_orig[i].observed_statistic,
                            results_repr[i].observed_statistic),
                        'test_statistic': self.test_stat(
                            results_orig[i].test_distribution,
                            results_repr[i].test_distribution)
                    } for i in range(len(models_orig))}

        return results

    @staticmethod
    def get_hash(filename):
        with open(filename, "rb") as f:
            bytes_file = f.read()
            readable_hash = hashlib.sha256(bytes_file).hexdigest()
        return readable_hash

    def get_filecomp(self):

        win_orig = timewindow2str(self.original.timewindows)
        win_repr = timewindow2str(self.reproduced.timewindows)

        tests_orig = self.original.tests
        tests_repr = self.reproduced.tests

        models_orig = [i.name for i in self.original.models]
        models_repr = [i.name for i in self.reproduced.models]

        results = dict.fromkeys([i.name for i in tests_orig])

        for test in tests_orig:
            if test.type in ['consistency', 'comparative']:
                results[test.name] = dict.fromkeys(win_orig)
                for tw in win_orig:
                    results[test.name][tw] = dict.fromkeys(models_orig)
                    for model in models_orig:
                        orig_path = self.original.path(
                            tw, 'evaluations', test, model)
                        repr_path = self.reproduced.path(
                            tw, 'evaluations', test, model)
                        results[test.name][tw][model] = {
                            'hash': (self.get_hash(orig_path) ==
                                     self.get_hash(repr_path)),
                            'byte2byte': filecmp.cmp(orig_path, repr_path)}
            else:
                results[test.name] = dict.fromkeys(models_orig)
                for model in models_orig:
                    orig_path = self.original.path(
                        win_orig[-1], 'evaluations', test, model)
                    repr_path = self.reproduced.path(
                        win_orig[-1], 'evaluations', test, model)
                    results[test.name][model] = {
                        'hash': (self.get_hash(orig_path) ==
                                 self.get_hash(repr_path)),
                        'byte2byte': filecmp.cmp(orig_path, repr_path)}
        return results

    def compare_results(self):

        self.num_results = self.get_results()
        self.file_comp = self.get_filecomp()
        self.write_report()

    def write_report(self):

        numerical = self.num_results
        data = self.file_comp
        outname = os.path.join('reproducibility_report.md')
        save_path = os.path.dirname(
            os.path.join(self.reproduced.path.workdir,
                         self.reproduced.path.rundir))

        report = MarkdownReport(outname=outname)
        report.add_title(
            f"Reproducibility Report - {self.original.name}", ''
        )

        report.add_heading("Objectives", level=2)
        objs = [
            "Analyze the statistic reproducibility and data reproducibility"
            " of the experiment. Compares the differences between "
            "(i) the original and reproduced scores,"
            " (ii) the statistical descriptors of the test distributions,"
            " (iii) the p-value of a Kolmogorov-Smirnov test -"
            " values beneath 0.1 mean we can't reject that the distributions"
            " are similar -,"
            " (iv) hash (SHA-256) comparison between the results' files and "
            "(v) byte-to-byte comparison"
        ]
        report.add_list(objs)

        for num, dat in zip(numerical.items(), data.items()):
            res_keys = list(num[1].keys())
            is_time = False
            try:
                str2timewindow(res_keys[0])
                is_time = True
            except ValueError:
                pass
            if is_time:
                report.add_heading(num[0], level=2)
                for tw in res_keys:
                    rows = [[tw, 'Score difference', 'Test Mean diff.',
                             'Test Std diff.', 'Test Skew diff.',
                             'KS-test p value', 'Hash (SHA-256) equal',
                             'Byte-to-byte equal']]
                    for model_stat, model_file in zip(num[1][tw].items(),
                                                      dat[1][tw].items()):
                        obs = model_stat[1]['observed_statistic']
                        test = model_stat[1]['test_statistic']
                        rows.append([model_stat[0], obs,
                                     *[f'{i:.1e}' for i in test[:-1]],
                                     f'{test[-1]:.1e}',
                                     model_file[1]['hash'],
                                     model_file[1]['byte2byte']])
                    report.add_table(rows)
            else:
                report.add_heading(num[0], level=2)
                rows = [[tw, 'Max Score difference', 'Hash (SHA-256) equal',
                         'Byte-to-byte equal']]
                for model_stat, model_file in zip(num[1].items(),
                                                  dat[1].items()):
                    obs = numpy.nanmax(model_stat[1]['observed_statistic'])
                    rows.append([model_stat[0], f'{obs:.1e}',
                                 model_file[1]['hash'],
                                 model_file[1]['byte2byte']])
                report.add_table(rows)

        report.table_of_contents()
        report.save(save_path)


#######################
# Perhaps add to pycsep
#######################

def plot_sequential_likelihood(evaluation_results, plot_args=None):
    if plot_args is None:
        plot_args = {}
    title = plot_args.get('title', None)
    titlesize = plot_args.get('titlesize', None)
    ylabel = plot_args.get('ylabel', None)
    colors = plot_args.get('colors', [None] * len(evaluation_results))
    linestyles = plot_args.get('linestyles',
                               [None] * len(evaluation_results))
    markers = plot_args.get('markers', [None] * len(evaluation_results))
    markersize = plot_args.get('markersize', 1)
    linewidth = plot_args.get('linewidth', 0.5)
    figsize = plot_args.get('figsize', (6, 4))
    timestrs = plot_args.get('timestrs', None)
    if timestrs:
        startyear = [date.fromisoformat(j.split('_')[0])
                     for j in timestrs][0]
        endyears = [date.fromisoformat(j.split('_')[1]) for j in timestrs]
        years = [startyear] + endyears
    else:
        startyear = 0
        years = numpy.arange(
            0, len(evaluation_results[0].observed_statistic) + 1)

    seaborn.set_style("white",
                      {"axes.facecolor": ".9", 'font.family': 'Ubuntu'})
    pyplot.rcParams.update({'xtick.bottom': True, 'axes.labelweight': 'bold',
                            'xtick.labelsize': 8, 'ytick.labelsize': 8,
                            'legend.fontsize': 9})
    if isinstance(colors, list):
        assert len(colors) == len(evaluation_results)
    elif isinstance(colors, str):
        colors = [colors] * len(evaluation_results)
    if isinstance(linestyles, list):
        assert len(linestyles) == len(evaluation_results)
    elif isinstance(linestyles, str):
        linestyles = [linestyles] * len(evaluation_results)
    if isinstance(markers, list):
        assert len(markers) == len(evaluation_results)
    elif isinstance(markers, str):
        markers = [markers] * len(evaluation_results)

    fig, ax = pyplot.subplots(figsize=figsize)
    for i, result in enumerate(evaluation_results):
        data = [0] + result.observed_statistic
        ax.plot(years, data, color=colors[i], linewidth=linewidth,
                linestyle=linestyles[i], marker=markers[i],
                markersize=markersize, label=result.sim_name)

    ax.set_ylabel(ylabel)
    ax.set_xlim([startyear, None])
    ax.set_title(title, fontsize=titlesize)
    ax.grid(True)
    ax.legend(loc=(1.04, 0), fontsize=7)
    fig.tight_layout()

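# Illustrative plot_args sketch (assumption: keys mirror the .get() calls
# above; `results` and `windows` are hypothetical, values arbitrary).
#   plot_sequential_likelihood(results,
#                              plot_args={'title': 'Sequential likelihood',
#                                         'ylabel': 'Log-likelihood',
#                                         'colors': 'steelblue',
#                                         'timestrs': timewindow2str(windows)})
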
def magnitude_vs_time(catalog):
    mag = catalog.data['magnitude']
    time = [datetime.fromtimestamp(i / 1000.)
            for i in catalog.data['origin_time']]
    fig, ax = pyplot.subplots(figsize=(12, 4))
    ax.plot(time, mag, marker='o', linewidth=0, color='r', alpha=0.2)
    ax.set_xlabel('Date', fontsize=16)
    ax.set_ylabel('$M_w$', fontsize=16)
    ax.set_title('Magnitude vs. Time', fontsize=18)
    return ax

def plot_matrix_comparative_test(evaluation_results, p=0.05,
                                 order=True, plot_args={}):
    """ Produces a matrix plot for comparative tests for all models

    Args:
        evaluation_results (list of result objects): paired t-test results
        p (float): significance level
        order (bool): columns/rows ordered by ranking

    Returns:
        ax (matplotlib.Axes): handle for figure
    """
    names = [i.sim_name for i in evaluation_results]

    t_value = numpy.array(
        [Tw_i.observed_statistic for Tw_i in evaluation_results])
    t_quantile = numpy.array(
        [Tw_i.quantile[0] for Tw_i in evaluation_results])
    w_quantile = numpy.array(
        [Tw_i.quantile[1] for Tw_i in evaluation_results])
    score = numpy.sum(t_value, axis=1) / t_value.shape[0]

    if order:
        arg_ind = numpy.flip(numpy.argsort(score))
    else:
        arg_ind = numpy.arange(len(score))

    # Flip rows/cols if ordered by value
    data_t = t_value[arg_ind, :][:, arg_ind]
    data_w = w_quantile[arg_ind, :][:, arg_ind]
    data_tq = t_quantile[arg_ind, :][:, arg_ind]

    fig, ax = pyplot.subplots(1, 1, figsize=(7, 6))
    cmap = seaborn.diverging_palette(220, 20, as_cmap=True)
    seaborn.heatmap(data_t, vmin=-3, vmax=3, center=0, cmap=cmap, ax=ax,
                    cbar_kws={'pad': 0.01, 'shrink': 0.7,
                              'label': 'Information Gain',
                              'anchor': (0., 0.)})
    ax.set_yticklabels([names[i] for i in arg_ind], rotation='horizontal')
    ax.set_xticklabels([names[i] for i in arg_ind], rotation='vertical')

    for n, i in enumerate(data_tq):
        for m, j in enumerate(i):
            if j > 0 and data_w[n, m] < p:
                ax.scatter(n + 0.5, m + 0.5, marker='o', s=5, color='black')

    legend_elements = [Line2D([0], [0], marker='o', lw=0,
                              label=r'T and W significant',
                              markerfacecolor="black",
                              markeredgecolor='black',
                              markersize=4)]
    fig.legend(handles=legend_elements, loc='lower right',
               bbox_to_anchor=(0.75, 0.0, 0.2, 0.2), handletextpad=0)
    pyplot.tight_layout()


#########################
# Below needs refactoring
#########################


def forecast_mapping(forecast_gridded, target_grid, ncpu=None):
    """
    Aggregates a conventional forecast onto a quadtree region.

    This is a generic function which can map any forecast onto another grid.
    Wrapper function over "_forecast_mapping_generic".

    Args:
        forecast_gridded: csep.core.forecast with another grid.
        target_grid: csep.core.region.CartesianGrid2D or QuadtreeGrid2D
        only_de-aggregate: Flag (True or False)

    Note: set the flag "only_deaggregate = True" only if one is sure that
     both grids are quadtrees and the target grid is higher-resolution at
     every level than the other grid.
    """
    from csep.core.forecasts import GriddedForecast
    bounds_target = target_grid.bounds
    bounds = forecast_gridded.region.bounds
    data = forecast_gridded.data
    data_mapped_bounds = _forecast_mapping_generic(bounds_target, bounds,
                                                   data, ncpu=ncpu)
    target_forecast = GriddedForecast(data=data_mapped_bounds,
                                      region=target_grid,
                                      magnitudes=forecast_gridded.magnitudes)
    return target_forecast


def plot_quadtree_forecast(qtree_forecast):
    """
    Currently, only a single-resolution plotting capability is available.
    So we aggregate the multi-resolution forecast on a single-resolution grid
    and then plot it.

    Args:
        qtree_forecast: csep.core.models.GriddedForecast

    Returns:
        class:`matplotlib.pyplot.ax` object
    """
    quadkeys = qtree_forecast.region.quadkeys
    qk_sizes = []
    for qk in quadkeys:
        qk_sizes.append(len(qk))

    if qk_sizes.count(qk_sizes[0]) == len(qk_sizes):
        # single-resolution grid
        ax = qtree_forecast.plot()
    else:
        print('Multi-resolution grid detected.')
        print('Currently, we do not offer utility to plot a forecast with '
              'multi-resolution grid')
        print('Therefore, forecast is being aggregated on a '
              'single-resolution grid (L8) for plotting')

        single_res_grid_l8 = QuadtreeGrid2D.from_single_resolution(8)
        forecast_l8 = forecast_mapping(qtree_forecast, single_res_grid_l8)
        ax = forecast_l8.plot()

    return ax


def plot_forecast_lowres(forecast, plot_args, k=4):
    """
    Plot a reduced-resolution plot. The forecast values are kept the same,
    but cells are enlarged.

    :param forecast: GriddedForecast object
    :param plot_args: arguments to be passed to plot_spatial_dataset
    :param k: Resampling factor. Selects cells every k rows and k columns.
    """
    print('\tPlotting Forecast')

    plot_args['title'] = forecast.name
    region = forecast.region
    coords = region.origins()
    dataset = numpy.log10(forecast.spatial_counts(cartesian=True))[::k, ::k]
    region.xs = numpy.unique(region.get_cartesian(coords[:, 0])[0, ::k])
    region.ys = numpy.unique(region.get_cartesian(coords[:, 1])[::k, 0])
    plot_spatial_dataset(dataset, region, set_global=True,
                         plot_args=plot_args)


def quadtree_csv_loader(csv_fname):
    """ Loads a quadtree forecast stored as a csv file.

    The format expects the forecast as a comma-separated file, in which the
    first column corresponds to the quadtree grid cell (quadkey). The second
    and third columns indicate the depth range. The remaining entries in each
    row are the forecast rates corresponding to the magnitude bins. The first
    line of the forecast is a header, and its format is listed here:

        'Quadkey', depth_min, depth_max, Mag_0, Mag_1, Mag_2, Mag_3, ...

    Quadkey is a string; the rest of the values are floats. For the purposes
    of defining region objects, the quadkey is used. We assume that the
    starting values of the magnitude bins are provided in the header.

    Args:
        csv_fname: file name of csep forecast in csv format

    Returns:
        rates, region, mws (numpy.ndarray, QuadtreeRegion2D, numpy.ndarray):
            rates, region, and magnitude bins needed to define QuadTree
            models
    """
    data = numpy.genfromtxt(csv_fname, dtype='str', delimiter=',')
    quadkeys = data[1:, 0]
    mws = data[0, 3:].astype(float)
    rates = data[1:, 3:]
    rates = rates.astype(float)
    region = QuadtreeGrid2D.from_quadkeys(quadkeys, magnitudes=mws)
    region.get_cell_area()

    return rates, region, mws


def geographical_area_from_qk(quadk):
    """
    Wrapper around function geographical_area_from_bounds
    """
    bounds = tile_bounds(quadk)
    return geographical_area_from_bounds(bounds[0], bounds[1],
                                         bounds[2], bounds[3])


def tile_bounds(quad_cell_id):
    """
    Takes in a single quadkey and returns the lats and lons of two diagonal
    corners using mercantile.

    Parameters
    ----------
    quad_cell_id : String
        Quadkey of a cell.

    Returns
    -------
    bounds : list
        Latitude and longitude of the bottom-left AND top-right corners.

    """
    bounds = mercantile.bounds(mercantile.quadkey_to_tile(quad_cell_id))
    return [bounds.west, bounds.south, bounds.east, bounds.north]


def create_polygon(fg):
    """ Required for parallel processing """
    return shapely.geometry.Polygon(
        [(fg[0], fg[1]), (fg[2], fg[1]), (fg[2], fg[3]), (fg[0], fg[3])])


def calc_cell_area(cell):
    """ Required for parallel processing """
    return geographical_area_from_bounds(cell[0], cell[1], cell[2], cell[3])


def _map_overlapping_cells(fcst_grid_poly, fcst_cell_area, fcst_rate_poly,
                           target_poly):
    """
    This function works for cells that do not directly coincide with the
    target polygon cells. It uses 3 variables, i.e. fcst_grid_poly,
    fcst_cell_area and fcst_rate_poly.

    It takes 1 target polygon, upon which models are to be mapped, finds all
    the cells of the forecast grid that match this polygon, and then maps the
    forecast rate of those cells according to area.

    fcst_grid_poly (variable in memory): The grid that needs to be mapped on
     target_poly
    fcst_rate_poly (variable in memory): The forecast that needs to be mapped
     on the target grid polygon
    fcst_cell_area (variable in memory): The cell area of the forecast grid

    Args:
        target_poly: One polygon upon which the forecast grid is to be mapped

    Returns:
        The forecast rate received by target_poly
    """
    map_rate = numpy.array([0])
    for j in range(len(fcst_grid_poly)):
        # Iterates over ALL the cells of the forecast grid and finds the
        # cells that overlap with the target cell (poly).
        if target_poly.intersects(fcst_grid_poly[j]):  # overlaps
            intersect = target_poly.intersection(fcst_grid_poly[j])
            shared_area = geographical_area_from_bounds(intersect.bounds[0],
                                                        intersect.bounds[1],
                                                        intersect.bounds[2],
                                                        intersect.bounds[3])
            map_rate = map_rate + (
                    fcst_rate_poly[j] * (shared_area / fcst_cell_area[j]))
    return map_rate


def _map_exact_inside_cells(fcst_grid, fcst_rate, boundary):
    """
    Uses 2 global variables: fcst_grid, fcst_rate.

    Takes a cell boundary and finds all those fcst_grid cells that fit
    exactly inside it, and then sums up the rates of all those cells fitting
    inside it to get the forecast rate for the boundary cell.

    Args:
        boundary: 1 cell with [lon1, lat1, lon2, lat2]

    Returns:
        1 - sum of forecast_rates for cells that fall entirely inside the
            boundary cell
        2 - Array of the corresponding cells that fall inside
    """
    c = numpy.logical_and(numpy.logical_and(fcst_grid[:, 0] >= boundary[0],
                                            fcst_grid[:, 1] >= boundary[1]),
                          numpy.logical_and(fcst_grid[:, 2] <= boundary[2],
                                            fcst_grid[:, 3] <= boundary[3]))

    exact_cells = numpy.where(c == True)

    return numpy.sum(fcst_rate[c], axis=0), exact_cells


def _forecast_mapping_generic(target_grid, fcst_grid, fcst_rate, ncpu=None):
    """
    This function can perform both aggregation and de-aggregation. It is a
    wrapper function that uses 4 functions in respective order, i.e.
    _map_exact_inside_cells, _map_overlapping_cells, calc_cell_area and
    create_polygon.

    Maps the forecast rates of one grid onto another grid using parallel
    processing. Works in two steps:
        1 - Maps all those cells that fall entirely inside target cells
        2 - The cells that overlap with multiple cells are mapped according
            to cell area

    Inputs:
        target_grid: Target grid bounds, upon which the forecast is to be
            mapped. [n x 4] array, bottom-left and top-right corners
            [lon1, lat1, lon2, lat2]
        fcst_grid: Grid that is available with the forecast.
            Same format as target_grid
        fcst_rate: Forecast rates to be mapped. [n x mbins]

    Returns:
        target_rates: Forecast rates mapped on the target grid [n x 1]
    """
    if ncpu is None:
        ncpu = multiprocessing.cpu_count()
        pool = multiprocessing.Pool(ncpu)
    else:
        pool = multiprocessing.Pool(ncpu)  # mp.cpu_count()
    print('Number of CPUs :', ncpu)

    func_exact = partial(_map_exact_inside_cells, fcst_grid, fcst_rate)
    exact_rate = pool.map(func_exact, [poly for poly in target_grid])
    pool.close()

    exact_cells = []
    exact_rate_tgt = []
    for i in range(len(exact_rate)):
        exact_cells.append(exact_rate[i][1][0])
        exact_rate_tgt.append(exact_rate[i][0])

    exact_cells = numpy.concatenate(exact_cells)
    # Exclude all those cells from the grid that have already fallen
    # entirely inside any cell of the target grid
    fcst_rate_poly = numpy.delete(fcst_rate, exact_cells, axis=0)
    lft_fcst_grid = numpy.delete(fcst_grid, exact_cells, axis=0)

    # Now work only with those cells that overlap with multiple target cells
    # Get the polygons of the remaining forecast grid cells
    pool = multiprocessing.Pool(ncpu)
    fcst_grid_poly = pool.map(create_polygon, [i for i in lft_fcst_grid])
    pool.close()

    # Get the cell area of the forecast grid
    pool = multiprocessing.Pool(ncpu)
    fcst_cell_area = pool.map(calc_cell_area, [i for i in lft_fcst_grid])
    pool.close()

    # print('Calculate target polygons')
    pool = multiprocessing.Pool(ncpu)
    target_grid_poly = pool.map(create_polygon, [i for i in target_grid])
    pool.close()

    # print('--2nd Step: Start Polygon mapping--')
    pool = multiprocessing.Pool(ncpu)
    func_overlapping = partial(_map_overlapping_cells, fcst_grid_poly,
                               fcst_cell_area, fcst_rate_poly)
    # Uses the above three global parameters
    rate_tgt = pool.map(func_overlapping,
                        [poly for poly in target_grid_poly])
    pool.close()

    zero_pad_len = numpy.shape(fcst_rate)[1]
    for i in range(len(rate_tgt)):
        if len(rate_tgt[i]) < zero_pad_len:
            rate_tgt[i] = numpy.zeros(zero_pad_len)

    map_rate = numpy.add(rate_tgt, exact_rate_tgt)

    return map_rate


def _set_dockerfile(name):
    string = f"""
## Install Docker image from trusted source
FROM python:3.8.13

## Setup user args
ARG USERNAME={name}
ARG USER_UID=1100
ARG USER_GID=$USER_UID

RUN mkdir -p /usr/src/{name} && chown $USER_UID:$USER_GID /usr/src/{name}
RUN groupadd --non-unique -g $USER_GID $USERNAME && useradd -u $USER_UID -g $USER_GID -s /bin/sh -m $USERNAME

## Set up work directory in the Docker container
WORKDIR /usr/src/{name}/

## Copy the files from the local machine (the repository) to the Docker container
COPY --chown=$USER_UID:$USER_GID . /usr/src/{name}/

## Calls setup.py, install python dependencies and install this model as a python module
ENV VIRTUAL_ENV=/venv/
RUN python3 -m venv $VIRTUAL_ENV
ENV PATH="$VIRTUAL_ENV/bin:$PATH"
# RUN pip install --no-cache-dir --upgrade pip
# RUN pip install -r requirements.txt
RUN pip install numpy pandas h5py

USER $USERNAME
"""
    return string


def _global_region(dh=0.1, name="global", magnitudes=None):
    """
    Creates a global region used for evaluating gridded models on the global
    scale. Modified from csep.core.regions.global_region.

    Args:
        dh: grid spacing in degrees

    Returns:
        csep.utils.CartesianGrid2D:
    """
    # generate latitudes
    lons = numpy.arange(-180.0, 180, dh)
    lats = numpy.arange(-90, 90, dh)
    coords = itertools.product(lons, lats)
    region = CartesianGrid2D(
        [Polygon(bbox) for bbox in compute_vertices(coords, dh)], dh,
        name=name)
    if magnitudes is not None:
        region.magnitudes = magnitudes
    return region


def _check_zero_bins(exp, catalog, test_date):
    for model in exp.models:
        forecast = model.create_forecast(exp.start_date, test_date)
        catalog.filter_spatial(forecast.region)
        bins = catalog.get_spatial_idx()
        zero_forecast = numpy.argwhere(forecast.spatial_counts()[bins] == 0)
        if zero_forecast:
            print(zero_forecast)
        ax = catalog.plot(plot_args={'basemap': 'stock_img'})
        ax = forecast.plot(ax=ax, plot_args={'alpha': 0.8})
        ax.plot(catalog.get_longitudes()[zero_forecast.ravel()],
                catalog.get_latitudes()[zero_forecast.ravel()],
                'o', markersize=10)
        pyplot.savefig(f'{model.path}/{model.name}.png', dpi=300)

    for model in exp.models:
        forecast = model.create_forecast(exp.start_date, test_date)
        catalog.filter_spatial(forecast.region)
        sbins = catalog.get_spatial_idx()
        mbins = catalog.get_mag_idx()
        zero_forecast = numpy.argwhere(forecast.data[sbins, mbins] == 0)
        print('event', 'cell', sbins[zero_forecast], 'datum',
              catalog.data[zero_forecast])
        if zero_forecast:
            print(zero_forecast)
            print('cellfc', forecast.get_longitudes()[sbins[zero_forecast]],
                  forecast.get_latitudes()[sbins[zero_forecast]])
            print('scounts', forecast.spatial_counts()[sbins[zero_forecast]])
            print('data', forecast.data[sbins[zero_forecast]])
            print(forecast.data[zero_forecast[0]])
        ax = catalog.plot(plot_args={'basemap': 'stock_img'})
        ax = forecast.plot(ax=ax, plot_args={'alpha': 0.8})
        ax.plot(catalog.get_longitudes()[zero_forecast.ravel()],
                catalog.get_latitudes()[zero_forecast.ravel()],
                'o', markersize=10)
        pyplot.savefig(f'{model.path}/{model.name}.png', dpi=300)