4 changes: 4 additions & 0 deletions dskit/cleaning.py
@@ -113,6 +113,8 @@ def outlier_summary(df, method='iqr', threshold=1.5):
         elif method == 'zscore':
             mean = df[col].mean()
             std = df[col].std()
+            if std == 0:
+                continue
             z_scores = (df[col] - mean) / std
             outliers = df[np.abs(z_scores) > 3]

@@ -139,6 +141,8 @@ def remove_outliers(df, method='iqr', threshold=1.5):
         elif method == 'zscore':
             mean = df[col].mean()
             std = df[col].std()
+            if std == 0:
+                continue
             z_scores = (df[col] - mean) / std
             df = df[np.abs(z_scores) <= 3]

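Why both new guards matter: a constant column has zero standard deviation, so the z-score expression divides 0 by 0 and fills z_scores with NaN, meaning the comparison against 3 is computed on invalid values. A minimal standalone sketch of the failure mode the guard prevents, with made-up column names:

import numpy as np
import pandas as pd

df = pd.DataFrame({
    "price": [10.0, 12.0, 11.5, 300.0],  # varying column: z-scores are well defined
    "flag": [1.0, 1.0, 1.0, 1.0],        # constant column: std() == 0
})

for col in df.columns:
    mean, std = df[col].mean(), df[col].std()
    if std == 0:
        continue  # without this, (df[col] - mean) / std is 0/0 and yields all-NaN z-scores
    z_scores = (df[col] - mean) / std
    print(col, int((np.abs(z_scores) > 3).sum()), "rows beyond |z| = 3")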
48 changes: 44 additions & 4 deletions dskit/core.py
@@ -1,7 +1,7 @@
 import pandas as pd
 from . import io, cleaning, visualization, preprocessing, modeling, explainability, eda
 from . import feature_engineering, nlp_utils, advanced_visualization, advanced_modeling, auto_ml, comprehensive_eda
-
+from typing import Literal, Optional, Annotated
 class dskit:
     def __init__(self, df=None):
         self.df = df
@@ -17,8 +17,8 @@ def load(filepath):
         return dskit(io.load(filepath))
 
     @staticmethod
-    def read_folder(folder_path, file_type='csv'):
-        return dskit(io.read_folder(folder_path, file_type))
+    def read_folder(folder_path: str, file_type: Literal['csv', 'xls', 'xlsx', 'json', 'parquet'] = 'csv', dynamic: bool = False, display_ignored: bool = False):
+        return dskit(io.read_folder(folder_path, file_type, dynamic, display_ignored))
 
     def save(self, filepath, **kwargs):
         io.save(self.df, filepath, **kwargs)
@@ -73,7 +73,47 @@ def text_stats(self, text_cols=None):
     def generate_wordcloud(self, text_col, max_words=100):
         nlp_utils.generate_wordcloud(self.df, text_col, max_words)
         return self
 
+    def generate_vocabulary(self, text_col: str, case: Optional[Literal['lower', 'upper']] = None):
+        return nlp_utils.generate_vocabulary(self.df, text_col, case)
+
+    def apply_nltk(
+        self,
+        text_column: Annotated[str, "Column name containing raw text"],
+        output_column: Annotated[str, "Output column name for processed text"] = "cleaned_nltk",
+        apply_case: Annotated[
+            Optional[Literal['lower', 'upper', 'sentence', 'title']],
+            "Case transformation to apply"
+        ] = 'lower',
+        allow_download: Annotated[bool, "Automatically download required NLTK resources if missing"] = False,
+        remove_stopwords: Annotated[bool, "Remove stopwords using the NLTK stopword corpus"] = True,
+        keep_words: Annotated[
+            list[str],
+            "Words to retain even if stopword removal is enabled"
+        ] = ["not", "no", "off"],
+        remove_words: Annotated[
+            list[str],
+            "Words to explicitly remove from the text"
+        ] = [],
+        use_tokenizer: Annotated[bool, "Use the NLTK tokenizer instead of a simple whitespace split"] = True,
+        language: Annotated[str, "Language for stopword removal"] = 'english',
+        canonicalization: Annotated[
+            Optional[Literal['stemming', 'lemmatization']],
+            "Canonicalization strategy"
+        ] = 'stemming'
+    ) -> "dskit":
+        self.df = nlp_utils.apply_nltk(
+            df=self.df,
+            text_column=text_column,
+            output_column=output_column,
+            apply_case=apply_case,
+            allow_download=allow_download,
+            remove_stopwords=remove_stopwords,
+            keep_words=keep_words,
+            remove_words=remove_words,
+            use_tokenizer=use_tokenizer,
+            language=language,
+            canonicalization=canonicalization
+        )
+        return self
+
     def clean(self):
         """
         Chains multiple cleaning steps: fix_dtypes -> rename_columns -> fill_missing
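For orientation, here is a hedged usage sketch of the new fluent surface in core.py, assuming the package exports the dskit class at the top level and that nlp_utils implements the forwarded keywords; the folder, file, and column names are invented for illustration:

from dskit import dskit

kit = dskit.read_folder("data/reviews", file_type="csv")  # hypothetical folder of CSV exports
kit = kit.apply_nltk(
    text_column="review",                 # hypothetical raw-text column
    output_column="review_clean",
    allow_download=True,                  # fetch missing NLTK corpora on first run
    keep_words=["not", "no"],             # negations survive stopword removal
    canonicalization="lemmatization",
)
print(kit.df[["review", "review_clean"]].head())

Because apply_nltk assigns the result back to self.df and returns self, it chains with the other fluent methods such as clean().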
42 changes: 33 additions & 9 deletions dskit/io.py
@@ -27,25 +27,49 @@ def load(filepath):
         print(f"Error loading file: {e}")
         return None
 
-def read_folder(folder_path, file_type='csv'):
+def read_folder(folder_path: str, file_type: str = 'csv', dynamic: bool = False, display_ignored: bool = False):
     """
-    Loads multiple files from a folder and concatenates them.
+    Load and concatenate tabular files from a folder.
+
+    Parameters
+    ----------
+    folder_path : str
+        Path to the directory containing the files to be loaded.
+
+    file_type : str, default='csv'
+        File extension used to filter files when `dynamic=False`.
+        Example: 'csv', 'xlsx', 'parquet'.
+
+    dynamic : bool, default=False
+        If True, loads all files regardless of extension.
+        If False, only files matching `file_type` are loaded.
+
+    display_ignored : bool, default=False
+        If True, prints the list of files that were skipped
+        because they could not be loaded by the `load()` function.
     """
     if not os.path.exists(folder_path):
         raise FileNotFoundError(f"The folder '{folder_path}' was not found.")
 
-    all_files = glob.glob(os.path.join(folder_path, f"*.{file_type}"))
-
-    if not all_files:
-        print(f"No files found with extension .{file_type} in {folder_path}")
-        return None
+    if dynamic:
+        all_files = glob.glob(os.path.join(folder_path, "*.*"))
+        if not all_files:
+            print("No files found!")
+            return None
+    else:
+        all_files = glob.glob(os.path.join(folder_path, f"*.{file_type}"))
+        if not all_files:
+            print(f"No files found with extension .{file_type} in {folder_path}")
+            return None
 
     df_list = []
+    ignored = []
     for filename in all_files:
         df = load(filename)
         if df is not None:
             df_list.append(df)
-
+        else:
+            ignored.append(filename)
+    if display_ignored and ignored:
+        print("Ignored files:\n" + "\n".join(ignored))
     if df_list:
         return pd.concat(df_list, ignore_index=True)
     else:
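A behavioral note on the new flags: dynamic=True bypasses the extension filter and attempts every dotted file name, while display_ignored=True reports whatever load() could not parse instead of failing the whole run. A hedged sketch, assuming the io module is importable from the package and that the folder and file names below are hypothetical:

from dskit import io

# Suppose data/exports holds sales.csv, legacy.xlsx, and notes.txt.
# dynamic=True globs "*.*", so all three files are attempted;
# notes.txt fails inside load() and is collected in the ignored list.
df = io.read_folder("data/exports", dynamic=True, display_ignored=True)
# Prints, e.g.:
# Ignored files:
# data/exports/notes.txt
if df is not None:
    print(df.shape)

One design consequence worth noting: the "*.*" pattern only matches names that contain a dot, so extensionless files are skipped even in dynamic mode, and concatenation assumes the loaded frames share compatible columns.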