Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

patched dims of enrichment layers, formatting and linting #190

Merged
merged 2 commits into from
Nov 6, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion .github/workflows/build-and-test.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -12,7 +12,7 @@ jobs:
runs-on: ubuntu-latest
strategy:
matrix:
python-version: [3.7]
python-version: [3.7, 3.8, 3.9]

steps:
- uses: actions/checkout@v3
Expand Down
80 changes: 14 additions & 66 deletions phippery/cli.py
Original file line number Diff line number Diff line change
Expand Up @@ -19,6 +19,8 @@
# local
from phippery import utils
from phippery.string import string_ds
from phippery.utils import id_coordinate_from_query
from phippery.string import string_feature


# entry point
Expand Down Expand Up @@ -132,9 +134,6 @@ def about(filename, verbose):
click.echo(info)


from phippery.string import string_feature


@cli.command(name="about-feature")
@argument("feature", type=str)
@argument("filename", type=click.Path(exists=True))
Expand All @@ -143,20 +142,20 @@ def about(filename, verbose):
"--dimension",
type=click.Choice(["sample", "peptide"], case_sensitive=False),
default="sample",
help="The dimension we expect to find this feature"
help="The dimension we expect to find this feature",
)
@click.option(
"--distribution/--counts",
default=True,
help="Force a specific output of either value counts or distribution for quantitative features"
help="Force a specific output of either value counts or distribution for quantitative features",
)
# def string_feature(ds, feature: str, verbosity = 0, dim="sample"):
def about_feature(filename, dimension, feature, distribution):
"""
Summarize details about a specific sample or peptide annotation feature.

The function will tell you information about a specific feature
in you sample annnotation table, depending on it's inferred datatype.
in your sample annotation table, depending on its inferred datatype.
For numeric feature types the command will get information about quantiles,
for categorical or boolean feature types, the function will give
individual factor-level counts.
Expand All @@ -177,51 +176,6 @@ def about_feature(filename, dimension, feature, distribution):
click.echo(info)


@cli.command(name="split-groups")
@click.option(
"-d",
"--dimension",
type=click.Choice(["sample", "peptide"], case_sensitive=False),
default="sample",
)
@option("split-features", type=str)
@argument("filename", type=click.Path(exists=True))
def query_expression(filename, expression, dimension, output):
"""
Perform a single pandas-style query expression on dataset samples

This command takes a single string query statement,
aplied it to the sample table in the dataset,
and returns the dataset with the slice applied.
This mean that all enrichment layers get sliced.
If no output (-o) is provided, by default this command
will overwrite the provided dataset file.

\f


.. note:: for more information on pandas query style strings,
please see the
`Pandas documentation
<https://pandas.pydata.org/docs/reference/api/pandas.DataFrame.query.html>`_
additionally, I've found `this blog
<https://queirozf.com/entries/pandas-query-examples-sql-like-syntax-queries-in-dataframes>`_
very helpful for performing queries on a dataframe.
"""

try:
ds = utils.load(filename)
except Exception as e:
click.echo(e)

if output == None:
output = "sliced_dataset.phip"

q = utils.id_query(ds, expression, dimension)

utils.dump(ds.loc[{f"{dimension}_id": q}], output)


@cli.command(name="merge")
@option("-o", "--output", type=click.Path(exists=False), default=None, required=False)
@argument(
Expand All @@ -235,7 +189,7 @@ def merge(output, datasets):
except Exception as e:
click.echo(e)

if output == None:
if output is None:
output = "merged_dataset.phip"

merged = xr.merge(dss)
Expand All @@ -261,7 +215,7 @@ def query_expression(filename, expression, dimension, output):
Perform a single pandas-style query expression on dataset samples

This command takes a single string query statement,
aplied it to the sample table in the dataset,
applies it to the sample table in the dataset,
and returns the dataset with the slice applied.
This means that all enrichment layers get sliced.
If no output (-o) is provided, by default this command
Expand All @@ -284,7 +238,7 @@ def query_expression(filename, expression, dimension, output):
except Exception as e:
click.echo(e)

if output == None:
if output is None:
output = "sliced_dataset.phip"

q = utils.id_query(ds, expression, dimension)
Expand All @@ -300,7 +254,7 @@ def query_table(filename, expression_table, output):
"""
Perform dataset indexing using a csv giving a set of query expressions

This command takes a dsvpoviding a set
This command takes a csv providing a set
of queries for samples and/or peptides and
applies each to the respective annotation table in the dataset,
and returns the dataset with the slice applied.
Expand All @@ -316,7 +270,7 @@ def query_table(filename, expression_table, output):
:widths: 25 25
:header-rows: 1

* - dimension
* - dimension
- expression
* - sample
- "Cohort == 2.0"
Expand All @@ -338,7 +292,7 @@ def query_table(filename, expression_table, output):
click.echo(e)
return

if output == None:
if output is None:
if click.confirm(
f"Without providing output path, you overwrite {filename}. Do you want to continue?"
):
Expand Down Expand Up @@ -384,22 +338,16 @@ def to_tall_csv(filename: str, output: str):

# Open a connection to the output file
if output.endswith(".gz"):
handle = gzip.open(output, 'wt')
handle = gzip.open(output, "wt")
else:
handle = open(output, 'w')
handle = open(output, "w")

# Generate tall tables for each of the samples in turn
# Each of the tables for each sample will have the same column order
for i, sample_df in enumerate(utils.yield_tall(ds)):

# If it's the first one, include the header
handle.write(
sample_df.to_csv(
header=i == 0,
index=False,
na_rep="NA"
)
)
handle.write(sample_df.to_csv(header=i == 0, index=False, na_rep="NA"))
handle.close()


Expand Down
16 changes: 7 additions & 9 deletions phippery/modeling.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,8 +9,6 @@

import numpy as np
import xarray as xr
import pandas as pd
import itertools
import copy
import scipy.stats as st
from phippery.gampois import fit_gamma
Expand Down Expand Up @@ -67,7 +65,7 @@ def gamma_poisson_model(

trim_percentile : float
The percentile cutoff for removing peptides with very high counts.
(e.g. a value of 98 means peptides in the highest 2% in counts
(e.g. a value of 98 means peptides in the highest 2% in counts
would be removed from the fit)
This parameter is used to remove potential signal peptides that
would bias the fit.
Expand Down Expand Up @@ -111,11 +109,11 @@ def gamma_poisson_model(
counts.loc[:, :] = mlxp_gamma_poisson(counts, background_rates)

if inplace:
ds[new_table_name] = xr.DataArray(counts)
ds[new_table_name] = xr.DataArray(counts, dims=ds[data_table].dims)
return (alpha, beta)
else:
ds_copy = copy.deepcopy(ds)
ds_copy[new_table_name] = xr.DataArray(counts)
ds_copy[new_table_name] = xr.DataArray(counts, dims=ds[data_table].dims)
return (alpha, beta), ds_copy


Expand Down Expand Up @@ -153,13 +151,13 @@ def zscore(
background means and stddevs.

min_Npeptides_per_bin : int
Mininum number of peptides per bin.
Minimum number of peptides per bin.

lower_quantile_limit : float
Counts below this quantile are ignored for computing background mean and stddev.

upper_quantile_limit : float
Counts above this quantile are igonred for computing background mean and stddev.
Counts above this quantile are ignored for computing background mean and stddev.

data_table : str
The name of the enrichment layer from which you would like to compute Z-scores.
Expand Down Expand Up @@ -196,9 +194,9 @@ def zscore(
zscore_table.loc[:, :] = zs_df

if inplace:
ds[new_table_name] = xr.DataArray(zscore_table)
ds[new_table_name] = xr.DataArray(zscore_table, dims=ds[data_table].dims)
return None
else:
ds_copy = copy.deepcopy(ds)
ds_copy[new_table_name] = xr.DataArray(zscore_table)
ds_copy[new_table_name] = xr.DataArray(zscore_table, dims=ds[data_table].dims)
return ds_copy
Loading
Loading