Calculate

Summaries, prevalence tables, and association metrics.

summary()

```python
phenofhy.calculate.summary(df, traits=None, *, stratify=None, sex_col="derived.sex",
	age_col="derived.age_at_registration", age_bins=None, round_decimals=2,
	categorical_traits=None, label_mode="codes", codebook_csv=None,
	metadata_dir="./metadata", data_dictionary_csv=None, local_codebook=None,
	autodetect_coded_categoricals=True, autodetect_max_levels=10,
	autodetect_exclude=None, sex_keep=None, granularity="variable")
```

Compute grouped summaries for numeric and categorical traits.

Parameters

  df: pandas.DataFrame
    Input dataframe with participant, questionnaire, and derived columns.
  traits: Iterable[str] | None
    Optional iterable of column names to summarize. If None, auto-detects usable columns and skips ones ending with .pid.
  stratify: str | dict | None
    None for whole-sample summary, a column name to stratify by, or a single-key dict {col: [values]} to restrict strata.
  sex_col: str
    Column name for sex (used for default age group derivation).
  age_col: str
    Column name for age (used for default age group derivation).
  age_bins: dict | None
    Optional dict with keys "bins" and "labels" defining the age groupings.
  round_decimals: int
    Number of decimals to round numeric outputs.
  categorical_traits: Iterable[str] | None
    Optional iterable of traits to treat as categorical.
  label_mode: Literal["labels", "codes"]
    "labels" maps codes to their labels; "codes" keeps the raw codes.
  codebook_csv: str | None
    Optional path to codings CSV; if None, resolved from metadata_dir.
  metadata_dir: str
    Directory to search for codings and metadata files.
  data_dictionary_csv: str | None
    Optional path to data dictionary CSV for trait descriptions.
  local_codebook: dict | None
    Optional mapping of derived column name to {code: label}.
  autodetect_coded_categoricals: bool
    Whether to auto-detect small-cardinality numeric categoricals.
  autodetect_max_levels: int
    Maximum number of unique values for a numeric trait to be treated as categorical.
  autodetect_exclude: Iterable[str] | None
    Optional iterable of traits to exclude from auto-detect.
  sex_keep: dict | None
    Optional mapping to recode or keep sex values (currently unused).
  granularity: Literal["variable", "category"]
    "variable" for aggregated rows; "category" for per-category rows.

Returns

out: dict[str, pandas.DataFrame]
Dictionary with two DataFrames: numeric and categorical.

Example

```python
from phenofhy import calculate

result = calculate.summary(df, traits=["derived.age_at_registration", "derived.bmi"], stratify="derived.sex", age_bins={"bins": [18, 30, 60, 120], "labels": ["18-29", "30-59", "60+"]})
```

prevalence()

```python
phenofhy.calculate.prevalence(df, codings=None, traits=None, *, denominator="all",
	denominators=None, eligibility=None, wide_output=True,
	participant_col="participant.pid", metadata_dir="./metadata",
	codebook_csv=None, on_missing="warn", error_if_empty=False)
```

Compute prevalence counts and rates for coded or derived categorical traits.

Parameters

  df: pandas.DataFrame
    Input dataframe with participant, questionnaire, and derived columns.
  codings: pandas.DataFrame | dict | list | None
    Codings mapping or None to resolve from metadata_dir or codebook_csv.
  traits: Iterable[str] | None
    Optional traits to include; if None, all non-helper columns are used.
  denominator: Literal["all", "nonmissing"]
    "all" uses all participants as the denominator; "nonmissing" uses the participants with non-missing values for each trait.
  denominators: Iterable[str] | None
    Optional iterable of additional denominator keys; if provided, output includes prevalence per key.
  eligibility: dict | None
    Optional mapping of eligibility name to column or list of columns used for custom denominators.
  wide_output: bool
    If True, return wide columns for each denominator key.
  participant_col: str
    Column name for participant id.
  metadata_dir: str
    Directory to search for codings CSV if not provided.
  codebook_csv: str | None
    Optional path to codings CSV.
  on_missing: Literal["warn", "ignore", "error"]
    Behavior when metadata is missing.
  error_if_empty: bool
    If True, raise when no results are produced.

Returns

out: pandas.DataFrame
Prevalence counts and rates. Shape depends on denominator arguments and wide_output.

Example

```python
from phenofhy import calculate

prev = calculate.prevalence(df, traits=["derived.sex", "participant.demog_ethnicity_1_1"], denominator="nonmissing")
```

medication_prevalence()

```python
phenofhy.calculate.medication_prevalence(df, codings, medication_phenotypes, *,
	participant_col="participant.pid", denominator="all", return_what="both",
	fuzzy=True, fuzzy_cutoff=0.82, metadata_dir="./metadata", codebook_csv=None)
```

Compute medication prevalence for coded medication traits.

Parameters

  df: pandas.DataFrame
    Questionnaire or participant dataframe (fully-qualified columns).
  codings: pandas.DataFrame | dict | list | None
    Codings mapping or None to resolve from metadata_dir or codebook_csv.
  medication_phenotypes: pandas.DataFrame | list | dict
    Flexible specification of medication phenotypes. Each row resolves to (trait, coding_name, medication).
  participant_col: str
    Column name for participant id.
  denominator: Literal["all", "nonmissing"]
    Prevalence denominator.
  return_what: Literal["both", "per_medication", "group"]
    Return per-medication, grouped, or both.
  fuzzy: bool
    Whether to use fuzzy matching for medication meanings.
  fuzzy_cutoff: float
    Similarity cutoff for fuzzy matching.
  metadata_dir: str
    Directory to search for codings CSV if not provided.
  codebook_csv: str | None
    Optional path to codings CSV.

Returns

out: pandas.DataFrame | tuple[pandas.DataFrame, pandas.DataFrame]
A (per_medication, grouped) tuple of DataFrames when return_what is "both"; otherwise a single DataFrame with the per-medication or grouped results.

Example

```python
from phenofhy import calculate

per_med, grouped = calculate.medication_prevalence(df, codings=None, medication_phenotypes={"lipids": {"coding_name": "MEDICATIONS", "medication": ["Atorvastatin", "Simvastatin"]}}, return_what="both")
```

medication_summary()

```python
phenofhy.calculate.medication_summary(df, *, med_prefix="derived.medicates_",
	group_map=DEFAULT_MEDICAT_GROUP_MAP, inplace=True, return_summary=False)
```

Derive medication usage-pattern variables and optional summary.

Parameters

  df: pandas.DataFrame | tuple
    Input dataframe, or a (mapping_df, df) tuple (accepted for compatibility).
  med_prefix: str
    Prefix used to detect medication domain columns.
  group_map: dict | None
    Mapping of grouped medication flags to constituent columns.
  inplace: bool
    If True, mutate the input dataframe; otherwise return a copy.
  return_summary: bool
    If True, also return a summary DataFrame of derived vars.

Returns

out: pandas.DataFrame | tuple[pandas.DataFrame, pandas.DataFrame]
If return_summary is False, returns the mutated or copied DataFrame. If return_summary is True, returns (df, summary_df).

Example

```python
from phenofhy import calculate

df2, summary_df = calculate.medication_summary(df, return_summary=True)
```

phi_corr()

```python
phenofhy.calculate.phi_corr(df, vars_for_heatmap=None, *,
	med_prefix="derived.medicates_", outdir=None, save_basename="phi_corr")
```

Compute a Pearson correlation matrix (equivalent to the phi coefficient when the variables are binary), suitable for heatmaps.

Parameters

  df: pandas.DataFrame
    Input dataframe.
  vars_for_heatmap: Iterable[str] | None
    Optional iterable of columns to include. If None, uses medication prefix columns plus common usage-pattern vars.
  med_prefix: str
    Prefix used to select medication columns when vars_for_heatmap is None.
  outdir: str | None
    Optional directory to save CSV or parquet outputs.
  save_basename: str
    Basename for saved outputs if outdir is provided.

Returns

out: pandas.DataFrame
Square correlation matrix with values in [-1, 1].

Example

```python
from phenofhy import calculate

corr = calculate.phi_corr(df, outdir="outputs", save_basename="med_phi_corr")
```

matthews_corrcoef_series()

```python
phenofhy.calculate.matthews_corrcoef_series(a, b)
```

Compute Matthews correlation coefficient (phi) for two series.

Parameters

  a: pandas.Series
    Series to compare (binarized if needed).
  b: pandas.Series
    Series to compare (binarized if needed).

Returns

out: float
Phi coefficient in [-1, 1]. If the confusion matrix is degenerate, returns the Pearson correlation of the binarized arrays, or 0.0.

Raises

ValueError: Exception
If either series is not binary-like after binarization.

Example

```python
from phenofhy import calculate

phi = calculate.matthews_corrcoef_series(df["derived.any_meds_flag"], df["derived.polypharmacy_flag"])
```

Calculate

summary()

prevalence()

medication_prevalence()

medication_summary()

phi_corr()

matthews_corrcoef_series()

Calculate

summary()

prevalence()

medication_prevalence()

medication_summary()

phi_corr()

matthews_corrcoef_series()