#!/usr/bin/env python3
# -*- coding: utf-8 -*-
# Time-stamp: "2024-11-05 07:37:09 (ywatanabe)"
# File: ./scitex_repo/src/scitex/pd/_merge_columns.py
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
# Time-stamp: "2024-10-07 12:03:29 (ywatanabe)"
# ./src/scitex/pd/_merge_cols.py
from typing import List, Tuple, Union
import pandas as pd
[docs]
def merge_columns(
    df: pd.DataFrame,
    *args: Union[str, List[str], Tuple[str, ...]],
    sep: Union[str, None] = None,
    sep1: str = "_",
    sep2: str = "-",
    name: str = "merged",
) -> pd.DataFrame:
    """Return a copy of ``df`` with one extra column joining the given columns.

    Two modes are supported:

    * ``sep`` given: the stringified values are joined with ``sep``.
    * ``sep`` omitted: each value is prefixed with its column label
      (``<label><sep2><value>``) and the pairs are joined with ``sep1``.

    Example
    -------
    >>> df = pd.DataFrame({
    ...     'A': [0, 5, 10],
    ...     'B': [1, 6, 11],
    ...     'C': [2, 7, 12]
    ... })
    >>> # Simple concatenation with separator
    >>> merge_columns(df, 'A', 'B', sep=' ')
        A   B   C    A_B
    0   0   1   2    0 1
    1   5   6   7    5 6
    2  10  11  12  10 11
    >>> # With column labels
    >>> merge_columns(df, 'A', 'B', sep1='_', sep2='-')
        A   B   C     merged
    0   0   1   2    A-0_B-1
    1   5   6   7    A-5_B-6
    2  10  11  12  A-10_B-11

    Parameters
    ----------
    df : pd.DataFrame
        Input DataFrame. It is copied; the caller's frame is not modified.
    *args : Union[str, List[str], Tuple[str, ...]]
        Column names to join, either as separate arguments or as a single
        list/tuple.
    sep : str, optional
        Simple separator for values only (overrides sep1/sep2)
    sep1 : str, optional
        Separator between column-value pairs, by default "_"
    sep2 : str, optional
        Separator between column name and value, by default "-"
    name : str, optional
        Name for the merged column, by default "merged". When it is left at
        "merged" and ``sep`` is given, the new column is instead named by
        joining the column labels with "_".

    Returns
    -------
    pd.DataFrame
        DataFrame with added merged column

    Raises
    ------
    ValueError
        If no columns are specified.
    KeyError
        If any requested column is absent from ``df``.
    """
    _df = df.copy()
    columns = (
        args[0]
        if len(args) == 1 and isinstance(args[0], (list, tuple))
        else args
    )
    if not columns:
        raise ValueError("No columns specified for merging")

    missing = [col for col in columns if col not in _df.columns]
    if missing:
        raise KeyError(f"Columns not found in DataFrame: {missing}")

    # Resolve the output column name once for both the empty and the
    # non-empty path. Labels are stringified so non-string column labels
    # (e.g. integers) do not break the join.
    if name == "merged" and sep is not None:
        new_col_name = "_".join(str(col) for col in columns)
    else:
        new_col_name = name

    # Handle empty DataFrame case: nothing to join, just add the column.
    if len(_df) == 0:
        _df[new_col_name] = pd.Series(dtype=str)
        return _df

    selected = _df[list(columns)]
    n_selected = selected.shape[1]

    if sep is not None:
        # Simple value concatenation.
        as_text = selected.astype(str)
        per_column = [as_text.iloc[:, pos] for pos in range(n_selected)]
        merged = [sep.join(cells) for cells in zip(*per_column)]
    else:
        # Concatenation with column labels. Values are formatted column by
        # column so each keeps its own dtype; a row-wise apply would coerce
        # every row to a common dtype and render ints as "1.0" whenever a
        # float column is also selected.
        per_column = [
            [f"{label}{sep2}{val}" for val in selected.iloc[:, pos]]
            for pos, label in enumerate(selected.columns)
        ]
        merged = [sep1.join(cells) for cells in zip(*per_column)]

    _df[new_col_name] = pd.Series(merged, index=_df.index)
    return _df
# Short alias bound to the same function object as merge_columns.
# NOTE(review): presumably kept so callers using the older name keep working
# (the header above still mentions the former file name _merge_cols.py) — confirm.
merge_cols = merge_columns
# EOF
# #!./env/bin/python3
# # -*- coding: utf-8 -*-
# # Time-stamp: "2024-10-07 12:03:29 (ywatanabe)"
# # ./src/scitex/pd/_merge_cols.py
# def merge_columns(df, *args, sep1="_", sep2="-", name="merged"):
# """
# Join specified columns with their labels.
# Example:
# import pandas as pd
# import numpy as np
# df = pd.DataFrame(
# data=np.arange(25).reshape(5, 5),
# columns=["A", "B", "C", "D", "E"],
# )
# df1 = merge_columns(df, "A", "B", sep1="_", sep2="-")
# df2 = merge_columns(df, ["A", "B"], sep1="_", sep2="-")
# assert (df1 == df2).all().all() # True
# # A B C D E A_B
# # 0 0 1 2 3 4 A-0_B-1
# # 1 5 6 7 8 9 A-5_B-6
# # 2 10 11 12 13 14 A-10_B-11
# # 3 15 16 17 18 19 A-15_B-16
# # 4 20 21 22 23 24 A-20_B-21
# Parameters
# ----------
# df : pandas.DataFrame
# Input DataFrame
# *args : str or list
# Column names to join, either as separate arguments or a single list
# sep1 : str, optional
# Separator for joining column names, default "_"
# sep2 : str, optional
# Separator between column name and value, default "-"
# Returns
# -------
# pandas.DataFrame
# DataFrame with added merged column
# """
# _df = df.copy()
# columns = (
# args[0]
# if len(args) == 1 and isinstance(args[0], (list, tuple))
# else args
# )
# merged_col = _df[list(columns)].apply(
# lambda row: sep1.join(f"{col}{sep2}{val}" for col, val in row.items()),
# axis=1,
# )
# new_col_name = sep1.join(columns) if not name else str(name)
# _df[new_col_name] = merged_col
# return _df
# merge_cols = merge_columns
# # def merge_columns(_df, *columns):
# # """
# # Add a new column containing the string-merged values of the given columns.
# # DF = pd.DataFrame(data=np.arange(25).reshape(5,5),
# # columns=["A", "B", "C", "D", "E"],
# # )
# # print(DF)
# # # A B C D E
# # # 0 0 1 2 3 4
# # # 1 5 6 7 8 9
# # # 2 10 11 12 13 14
# # # 3 15 16 17 18 19
# # # 4 20 21 22 23 24
# # print(merge_columns(DF, "A", "B", "C"))
# # # A B C D E A_B_C
# # # 0 0 1 2 3 4 0_1_2
# # # 1 5 6 7 8 9 5_6_7
# # # 2 10 11 12 13 14 10_11_12
# # # 3 15 16 17 18 19 15_16_17
# # # 4 20 21 22 23 24 20_21_22
# # """
# # from copy import deepcopy
# # df = deepcopy(_df)
# # merged = deepcopy(df[columns[0]]) # initialization
# # for c in columns[1:]:
# # merged = scitex.ai.utils.merge_labels(list(merged), deepcopy(df[c]))
# # df.loc[:, scitex.gen.connect_strs(columns)] = merged
# # return df
# EOF