4 changes: 4 additions & 0 deletions dskit/cleaning.py
@@ -113,6 +113,8 @@ def outlier_summary(df, method='iqr', threshold=1.5):
         elif method == 'zscore':
             mean = df[col].mean()
             std = df[col].std()
+            if std == 0:
+                continue
             z_scores = (df[col] - mean) / std
             outliers = df[np.abs(z_scores) > 3]

@@ -139,6 +141,8 @@ def remove_outliers(df, method='iqr', threshold=1.5):
         elif method == 'zscore':
             mean = df[col].mean()
             std = df[col].std()
+            if std == 0:
+                continue
             z_scores = (df[col] - mean) / std
             df = df[np.abs(z_scores) <= 3]

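Why both new guards matter: a constant column has zero standard deviation, so the z-score expression divides 0 by 0 and fills z_scores with NaN, meaning the comparison against 3 is computed on invalid values. A minimal standalone sketch of the failure mode the guard prevents, with made-up column names:

import numpy as np
import pandas as pd

df = pd.DataFrame({
    "price": [10.0, 12.0, 11.5, 300.0],  # varying column: z-scores are well defined
    "flag": [1.0, 1.0, 1.0, 1.0],        # constant column: std() == 0
})

for col in df.columns:
    mean, std = df[col].mean(), df[col].std()
    if std == 0:
        continue  # without this, (df[col] - mean) / std is 0/0 and yields all-NaN z-scores
    z_scores = (df[col] - mean) / std
    print(col, int((np.abs(z_scores) > 3).sum()), "rows beyond |z| = 3")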
48 changes: 44 additions & 4 deletions dskit/core.py
@@ -1,7 +1,7 @@
 import pandas as pd
 from . import io, cleaning, visualization, preprocessing, modeling, explainability, eda
 from . import feature_engineering, nlp_utils, advanced_visualization, advanced_modeling, auto_ml, comprehensive_eda
-
+from typing import Literal, Optional, Annotated
 class dskit:
     def __init__(self, df=None):
         self.df = df
@@ -17,8 +17,8 @@ def load(filepath):
         return dskit(io.load(filepath))
 
     @staticmethod
-    def read_folder(folder_path, file_type='csv'):
-        return dskit(io.read_folder(folder_path, file_type))
+    def read_folder(folder_path: str, file_type: Literal['csv', 'xls', 'xlsx', 'json', 'parquet'] = 'csv', dynamic: bool = False, display_ignored: bool = False):
+        return dskit(io.read_folder(folder_path, file_type, dynamic, display_ignored))
 
     def save(self, filepath, **kwargs):
         io.save(self.df, filepath, **kwargs)
@@ -73,7 +73,47 @@ def text_stats(self, text_cols=None):
     def generate_wordcloud(self, text_col, max_words=100):
         nlp_utils.generate_wordcloud(self.df, text_col, max_words)
         return self
 
+    def generate_vocabulary(self, text_col: str, case: Optional[Literal['lower', 'upper']] = None):
+        return nlp_utils.generate_vocabulary(self.df, text_col, case)
+
+    def apply_nltk(
+        self,
+        text_column: Annotated[str, "Column name containing raw text"],
+        output_column: Annotated[str, "Output column name for processed text"] = "cleaned_nltk",
+        apply_case: Annotated[
+            Optional[Literal['lower', 'upper', 'sentence', 'title']],
+            "Case transformation to apply"
+        ] = 'lower',
+        allow_download: Annotated[bool, "Automatically download required NLTK resources if missing"] = False,
+        remove_stopwords: Annotated[bool, "Remove stopwords using the NLTK stopword corpus"] = True,
+        keep_words: Annotated[
+            list[str],
+            "Words to retain even if stopword removal is enabled"
+        ] = ["not", "no", "off"],
+        remove_words: Annotated[
+            list[str],
+            "Words to explicitly remove from the text"
+        ] = [],
+        use_tokenizer: Annotated[bool, "Use the NLTK tokenizer instead of a simple whitespace split"] = True,
+        language: Annotated[str, "Language for stopword removal"] = 'english',
+        canonicalization: Annotated[
+            Optional[Literal['stemming', 'lemmatization']],
+            "Canonicalization strategy"
+        ] = 'stemming'
+    ) -> "dskit":
+        self.df = nlp_utils.apply_nltk(
+            df=self.df,
+            text_column=text_column,
+            output_column=output_column,
+            apply_case=apply_case,
+            allow_download=allow_download,
+            remove_stopwords=remove_stopwords,
+            keep_words=keep_words,
+            remove_words=remove_words,
+            use_tokenizer=use_tokenizer,
+            language=language,
+            canonicalization=canonicalization
+        )
+        return self
+
     def clean(self):
         """
         Chains multiple cleaning steps: fix_dtypes -> rename_columns -> fill_missing
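For orientation, here is a hedged usage sketch of the new fluent surface in core.py, assuming the package exports the dskit class at the top level and that nlp_utils implements the forwarded keywords; the folder, file, and column names are invented for illustration:

from dskit import dskit

kit = dskit.read_folder("data/reviews", file_type="csv")  # hypothetical folder of CSV exports
kit = kit.apply_nltk(
    text_column="review",                 # hypothetical raw-text column
    output_column="review_clean",
    allow_download=True,                  # fetch missing NLTK corpora on first run
    keep_words=["not", "no"],             # negations survive stopword removal
    canonicalization="lemmatization",
)
print(kit.df[["review", "review_clean"]].head())

Because apply_nltk assigns the result back to self.df and returns self, it chains with the other fluent methods such as clean().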
42 changes: 33 additions & 9 deletions dskit/io.py
@@ -27,25 +27,49 @@ def load(filepath):
         print(f"Error loading file: {e}")
         return None
 
-def read_folder(folder_path, file_type='csv'):
+def read_folder(folder_path: str, file_type: str = 'csv', dynamic: bool = False, display_ignored: bool = False):
     """
-    Loads multiple files from a folder and concatenates them.
+    Load and concatenate tabular files from a folder.
+
+    Parameters
+    ----------
+    folder_path : str
+        Path to the directory containing the files to be loaded.
+
+    file_type : str, default='csv'
+        File extension used to filter files when `dynamic=False`.
+        Example: 'csv', 'xlsx', 'parquet'.
+
+    dynamic : bool, default=False
+        If True, loads all files regardless of extension.
+        If False, only files matching `file_type` are loaded.
+
+    display_ignored : bool, default=False
+        If True, prints the list of files that were skipped
+        because they could not be loaded by the `load()` function.
     """
     if not os.path.exists(folder_path):
         raise FileNotFoundError(f"The folder '{folder_path}' was not found.")
 
-    all_files = glob.glob(os.path.join(folder_path, f"*.{file_type}"))
-
-    if not all_files:
-        print(f"No files found with extension .{file_type} in {folder_path}")
-        return None
+    if dynamic:
+        all_files = glob.glob(os.path.join(folder_path, "*.*"))
+        if not all_files:
+            print("No files found!")
+            return None
+    else:
+        all_files = glob.glob(os.path.join(folder_path, f"*.{file_type}"))
+        if not all_files:
+            print(f"No files found with extension .{file_type} in {folder_path}")
+            return None
 
     df_list = []
+    ignored = []
     for filename in all_files:
         df = load(filename)
         if df is not None:
             df_list.append(df)
-
+        else:
+            ignored.append(filename)
+    if display_ignored and ignored:
+        print("Ignored files:\n" + "\n".join(ignored))
     if df_list:
         return pd.concat(df_list, ignore_index=True)
     else:
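A behavioral note on the new flags: dynamic=True bypasses the extension filter and attempts every dotted file name, while display_ignored=True reports whatever load() could not parse instead of failing the whole run. A hedged sketch, assuming the io module is importable from the package and that the folder and file names below are hypothetical:

from dskit import io

# Suppose data/exports holds sales.csv, legacy.xlsx, and notes.txt.
# dynamic=True globs "*.*", so all three files are attempted;
# notes.txt fails inside load() and is collected in the ignored list.
df = io.read_folder("data/exports", dynamic=True, display_ignored=True)
# Prints, e.g.:
# Ignored files:
# data/exports/notes.txt
if df is not None:
    print(df.shape)

One design consequence worth noting: the "*.*" pattern only matches names that contain a dot, so extensionless files are skipped even in dynamic mode, and concatenation assumes the loaded frames share compatible columns.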