Source code for tfs_files.diff_files

"""
Module tfs_files.diff_files
-------------------------------

Functions to get the difference between two tfs files or dataframes.
This is very general, i.e. not as results oriented as ``getdiff.py``.
"""
from tfs_files import tfs_pandas
import pandas as pd
import numpy as np


def get_diff_two_dataframes(df1, df2, diff_columns, is_error=None, prefix="", index=None,
                            keep_colums=(), out_file=None):
    """ Get the difference of common elements of specific columns between two dataframes.

    Merges on index (inner join), so only rows present in both inputs are returned.

    Args:
        df1 (DataFrame or Path): First dataframe, Minuend
        df2 (DataFrame or Path): Second dataframe, Subtrahend
        diff_columns (list of strings): List of columns to get the difference of
        is_error (list of booleans): defines if the column in question is an error column.
            Must have the same length as ``diff_columns`` (default: all ``False``)
        prefix (str): Prefix for difference columns (default: "")
        index (str): index column - most likely needed when reading/writing files
        keep_colums (list of strings): additional columns to keep in the returned dataframe
        out_file (Path): if given, writes the result into this file

    Returns:
        DataFrame containing the kept columns followed by the difference columns.

    Raises:
        ValueError: if ``is_error`` is given but its length differs from ``diff_columns``.
        KeyError: if a column of ``diff_columns`` or ``keep_colums`` is not present in
            both dataframes (raised by the column check).
    """
    # Normalise to a list: the default is a tuple, and "tuple + list" (used for the
    # final column selection below) raises a TypeError.
    keep_colums = list(keep_colums)

    # convert from files to dataframes
    df1 = _get_dataframe(df1, index)
    df2 = _get_dataframe(df2, index)

    # check input
    _check_for_missing_columns(df1, df2, diff_columns)
    _check_for_missing_columns(df1, df2, keep_colums)

    # merge dataframes; columns present in both inputs get the '_df1'/'_df2' suffixes
    merged = pd.merge(df1, df2, how='inner',
                      left_index=True, right_index=True,
                      suffixes=('_df1', '_df2'))

    # calculate difference
    if is_error is None:
        is_error = [False] * len(diff_columns)
    elif len(is_error) != len(diff_columns):
        raise ValueError(
            "The length of the is_error switch needs to correspond to the columns."
        )

    for idx, col in enumerate(diff_columns):
        if is_error[idx]:
            # NOTE(review): error columns are combined as half of the quadrature sum;
            # confirm the factor 0.5 is intended for the use case at hand.
            merged[prefix + col] = .5 * np.sqrt(np.square(merged[col + '_df1']) +
                                                np.square(merged[col + '_df2']))
        else:
            merged[prefix + col] = merged[col + '_df1'] - merged[col + '_df2']

    # copy columns to be kept: take the first variant that exists in the merged frame
    for col in keep_colums:
        for suffix in ["", "_df1", "_df2"]:
            try:
                merged[col] = merged[col + suffix]
            except KeyError:
                pass
            else:
                break

    # prepare output
    merged = merged.loc[:, keep_colums + [prefix + c for c in diff_columns]]
    if out_file:
        tfs_pandas.write_tfs(out_file, merged, save_index=index)
    return merged
# Helpers #####################################################################


def _get_dataframe(tfs_df, index):
    """ Return a dataframe for ``tfs_df``, reading it as a tfs file if possible.

    Args:
        tfs_df: Either something ``tfs_pandas.read_tfs`` can read, or an object to be
            passed through unchanged (presumably an already loaded DataFrame).
        index (str): index column handed to ``tfs_pandas.read_tfs``.

    Returns:
        The result of ``tfs_pandas.read_tfs``, or ``tfs_df`` itself if reading
        raised a ``TypeError``.
    """
    try:
        return tfs_pandas.read_tfs(tfs_df, index=index)
    except TypeError:
        # presumably raised when tfs_df is not path-like, i.e. already a dataframe
        return tfs_df


def _check_for_missing_columns(df1, df2, columns):
    """ Make sure every entry of ``columns`` is a column of both dataframes.

    Args:
        df1 (DataFrame): first dataframe to check.
        df2 (DataFrame): second dataframe to check.
        columns (iterable): column names that need to be present in both.

    Raises:
        KeyError: if at least one column is missing from ``df1`` or ``df2``.
            The message lists each missing column once, in the order given.
    """
    missing_columns = []
    for col in columns:
        if col in missing_columns:
            continue  # report every column only once
        if any(col not in df.columns for df in (df1, df2)):
            missing_columns.append(col)

    # Test for emptiness, not for truthiness of the names themselves:
    # a missing column called 0 or "" must be reported as well.
    if missing_columns:
        raise KeyError(
            "The following columns can not be found in either dataframe: {:}".format(
                missing_columns)
        )