Skip to content

data.analysis.description

Data - Analysis - Descriptions¤

count_column_values_within_ranges(df_inp, column_name, bins=None) ¤

Count how many values of a specific column fall within each of the defined ranges (bins).

Parameters:

Name Type Description Default
df_inp pd.DataFrame

pandas dataframe

required
column_name

column name to be counted

required
bins

list of values to be used as the ranges

None
Source code in dietbox/data/analysis/descriptions.py
def count_column_values_within_ranges(df_inp, column_name, bins=None):
    """
    Count the number of values of a specific column according to the defined ranges.

    The column is cast to float, binned with ``pd.cut`` and the number of
    values falling into each bin is counted. Values that fall outside of
    every bin are not counted. The input dataframe is not modified.

    :param pd.DataFrame df_inp: pandas dataframe
    :param column_name: column name to be counted
    :param bins: list of values to be used as the ranges (bin edges);
        defaults to ``np.arange(0, 7000, 100)``
    :return: dict with the keys ``"price"`` (``bins[:-1]``, i.e. the lower
        edge of each bin), ``"count"`` (number of values per bin, in bin
        order), ``"percent"`` (each bin's share of all counted values) and
        ``"all_prices"`` (the whole column as a float array)
    :raises Exception: if the column can not be binned with ``pd.cut``
    """

    if bins is None:
        bins = np.arange(0, 7000, 100)

    # Use the builtin ``float``: the ``np.float`` alias was removed in
    # NumPy 1.24 and both always meant the same float64 dtype.
    values_as_float = df_inp[column_name].astype(float)

    try:
        # Bin a standalone series instead of writing a "count" column into
        # ``df_inp``; callers (e.g. groupby slices) keep their data untouched.
        binned = pd.cut(values_as_float, bins)
    except Exception as e:
        # Keep the historical exception type/message, but chain the cause so
        # the real pd.cut failure stays visible in the traceback.
        raise Exception("Can not set cut to column") from e

    # value_counts on categorical data reports every bin (including empty
    # ones); sort_index puts them back into bin order.
    counts_per_bin = binned.value_counts().sort_index()
    percent_per_bin = counts_per_bin / counts_per_bin.sum()

    counting_data = {
        "price": bins[:-1],
        "count": counts_per_bin.values,
        "percent": percent_per_bin.values,
        "all_prices": values_as_float.values,
    }

    return counting_data

count_column_values_within_ranges_two_levels_deep(df_inp, first_groupby_column_name, second_groupby_column_name, count_column_name, bins=None) ¤

Count column values within ranges after grouping the dataframe twice: first by one column, then by a second column within each of those groups.

Source code in dietbox/data/analysis/descriptions.py
def count_column_values_within_ranges_two_levels_deep(
    df_inp,
    first_groupby_column_name,
    second_groupby_column_name,
    count_column_name,
    bins=None,
):
    """
    Count column values within ranges for every two-level group of a dataframe.

    The dataframe is grouped by ``first_groupby_column_name``; each of those
    groups is grouped again by ``second_groupby_column_name``, and
    ``count_column_values_within_ranges`` is applied to every innermost group.

    :param pd.DataFrame df_inp: pandas dataframe
    :param first_groupby_column_name: column used for the outer groupby
    :param second_groupby_column_name: column used for the inner groupby
    :param count_column_name: column name to be counted
    :param bins: list of values to be used as the ranges; when omitted a
        warning is logged and ``np.arange(0, 10000, 100)`` is used
    :return: nested dict ``{first_key: {second_key: counting_data}}``
    """

    if bins is None:
        logger.warning("No bins specified, will use a default range 0-10000")
        bins = np.arange(0, 10000, 100)

    nested_counts = {}
    for outer_key, outer_frame in df_inp.groupby(first_groupby_column_name):
        # One entry per second-level group found inside this first-level group.
        nested_counts[outer_key] = {
            inner_key: count_column_values_within_ranges(
                inner_frame, count_column_name, bins
            )
            for inner_key, inner_frame in outer_frame.groupby(
                second_groupby_column_name
            )
        }

    return nested_counts