diff --git a/dskit/cleaning.py b/dskit/cleaning.py
index 85b1e90..bb28ee5 100644
--- a/dskit/cleaning.py
+++ b/dskit/cleaning.py
@@ -113,6 +113,8 @@ def outlier_summary(df, method='iqr', threshold=1.5):
         elif method == 'zscore':
             mean = df[col].mean()
             std = df[col].std()
+            if std == 0:
+                continue
             z_scores = (df[col] - mean) / std
             outliers = df[np.abs(z_scores) > 3]
 
@@ -139,6 +141,8 @@ def remove_outliers(df, method='iqr', threshold=1.5):
         elif method == 'zscore':
             mean = df[col].mean()
             std = df[col].std()
+            if std == 0:
+                continue
             z_scores = (df[col] - mean) / std
             df = df[np.abs(z_scores) <= 3]
 
diff --git a/dskit/core.py b/dskit/core.py
index b400fcb..57725d7 100644
--- a/dskit/core.py
+++ b/dskit/core.py
@@ -1,7 +1,7 @@
 import pandas as pd
 from . import io, cleaning, visualization, preprocessing, modeling, explainability, eda
 from . import feature_engineering, nlp_utils, advanced_visualization, advanced_modeling, auto_ml, comprehensive_eda
-
+from typing import Literal, Optional, Annotated
 class dskit:
     def __init__(self, df=None):
         self.df = df
@@ -17,8 +17,8 @@ def load(filepath):
         return dskit(io.load(filepath))
 
     @staticmethod
-    def read_folder(folder_path, file_type='csv'):
-        return dskit(io.read_folder(folder_path, file_type))
+    def read_folder(folder_path: str, file_type: Literal['csv', 'xls', 'xlsx', 'json', 'parquet'] = 'csv', dynamic: bool = False, display_ignored: bool = False):
+        return dskit(io.read_folder(folder_path, file_type, dynamic, display_ignored))
 
     def save(self, filepath, **kwargs):
         io.save(self.df, filepath, **kwargs)
@@ -73,7 +73,47 @@ def text_stats(self, text_cols=None):
     def generate_wordcloud(self, text_col, max_words=100):
         nlp_utils.generate_wordcloud(self.df, text_col, max_words)
         return self
-
+    def generate_vocabulary(self, text_col: str, case: Optional[Literal['lower', 'upper']] = None):
+        return nlp_utils.generate_vocabulary(self.df, text_col, case)
+    def apply_nltk(
+        self,
+        text_column: Annotated[str, "Column name containing raw text"],
+        output_column: Annotated[str, "Output column name for processed text"] = "cleaned_nltk",
+        apply_case: Annotated[
+            Optional[Literal['lower', 'upper', 'sentence', 'title']],
+            "Case transformation to apply"
+        ] = 'lower',
+        allow_download: Annotated[bool, "Automatically download required NLTK resources if missing"] = False,
+        remove_stopwords: Annotated[bool, "Remove stopwords using NLTK stopword corpus"] = True,
+        keep_words: Annotated[
+            list[str],
+            "Words to retain even if stopword removal is enabled"
+        ] = ["not", "no", "off"],
+        remove_words: Annotated[
+            list[str],
+            "Words to explicitly remove from the text"
+        ] = [],
+        use_tokenizer: Annotated[bool, "Use NLTK tokenizer instead of simple whitespace split"] = True,
+        language: Annotated[str, "Language for stopword removal"] = 'english',
+        canonicalization: Annotated[
+            Optional[Literal['stemming', 'lemmatization']],
+            "Canonicalization strategy"
+        ] = 'stemming'
+    ) -> "dskit":
+        self.df = nlp_utils.apply_nltk(
+            df=self.df,
+            text_column=text_column,
+            output_column=output_column,
+            apply_case=apply_case,
+            allow_download=allow_download,
+            remove_stopwords=remove_stopwords,
+            keep_words=keep_words,
+            remove_words=remove_words,
+            use_tokenizer=use_tokenizer,
+            language=language,
+            canonicalization=canonicalization
+        )
+        return self
     def clean(self):
         """
         Chains multiple cleaning steps: fix_dtypes -> rename_columns -> fill_missing
diff --git a/dskit/io.py b/dskit/io.py
index 8e8c9c7..818e1e5 100644
--- a/dskit/io.py
+++ b/dskit/io.py
@@ -27,25 +27,49 @@ def load(filepath):
         print(f"Error loading file: {e}")
         return None
 
-def read_folder(folder_path, file_type='csv'):
+def read_folder(folder_path: str, file_type: str = 'csv', dynamic: bool = False, display_ignored: bool = False):
     """
-    Loads multiple files from a folder and concatenates them.
+    Load and concatenate tabular files from a folder.
+    Parameters
+    ----------
+    folder_path : str
+        Path to the directory containing the files to be loaded.
+
+    file_type : str, default='csv'
+        File extension to filter files when `dynamic=False`.
+        Example: 'csv', 'xlsx', 'parquet'.
+
+    dynamic : bool, default=False
+        If True, loads all files regardless of extension.
+        If False, only files matching `file_type` are loaded.
+
+    display_ignored : bool, default=False
+        If True, prints the list of files that were skipped
+        because they could not be loaded by the `load()` function.
     """
     if not os.path.exists(folder_path):
         raise FileNotFoundError(f"The folder '{folder_path}' was not found.")
-
-    all_files = glob.glob(os.path.join(folder_path, f"*.{file_type}"))
-
-    if not all_files:
-        print(f"No files found with extension .{file_type} in {folder_path}")
-        return None
+    if dynamic:
+        all_files = glob.glob(os.path.join(folder_path, "*.*"))
+        if not all_files:
+            print(f"No files found in {folder_path}")
+            return None
+    else:
+        all_files = glob.glob(os.path.join(folder_path, f"*.{file_type}"))
+        if not all_files:
+            print(f"No files found with extension .{file_type} in {folder_path}")
+            return None
 
     df_list = []
+    ignored = []
     for filename in all_files:
         df = load(filename)
         if df is not None:
             df_list.append(df)
-
+        else:
+            ignored.append(filename)
+    if display_ignored and ignored:
+        print("Ignored files:\n" + "\n".join(ignored))
     if df_list:
         return pd.concat(df_list, ignore_index=True)
     else:
diff --git a/dskit/nlp_utils.py b/dskit/nlp_utils.py
index 42a56dc..99f6f19 100644
--- a/dskit/nlp_utils.py
+++ b/dskit/nlp_utils.py
@@ -1,16 +1,19 @@
 import pandas as pd
 import numpy as np
+from typing import Literal, Optional, Iterable, Callable
+from collections import OrderedDict
+import re
+import warnings
+import importlib
+import string
+import matplotlib.pyplot as plt
 try:
-    import nltk
     from textblob import TextBlob
     from wordcloud import WordCloud
 except ImportError:
-    nltk = None
     TextBlob = None
     WordCloud = None
-import re
-import string
-import matplotlib.pyplot as plt
+
 
 def basic_text_stats(df, text_cols=None):
     """
@@ -230,4 +233,267 @@ def detect_language(df, text_col):
             languages.append('unknown')
 
     df[f'{text_col}_language'] = languages
-    return df
\ No newline at end of file
+    return df
+
+
+def generate_vocabulary(df: pd.DataFrame, text_col: str, case: Optional[Literal['lower', 'upper']] = None):
+    """
+    Return a list of the unique tokens found in a text column of the DataFrame.
+
+    :param df: dataframe
+    :type df: pd.DataFrame
+    :param text_col: name of the text column
+    :type text_col: str
+    :param case: case applied to the tokens. If not provided, words remain unchanged.
+    :type case: Literal['lower', 'upper']
+    """
+    if text_col not in df.columns:
+        print(f"Column '{text_col}' not found.")
+        return []
+    vocabulary = set()
+    for text in df[text_col].astype(str):
+        if case == 'lower':
+            text = text.lower()
+        elif case == 'upper':
+            text = text.upper()
+        text = text.split()
+        for t in text:
+            vocabulary.add(t)
+    return list(vocabulary)
+
+
+# nltk application
+_init_nltk_cache = OrderedDict()
+_MAX_CACHE_SIZE = 2
+
+
+def _get_from_cache(key):
+    try:
+        value = _init_nltk_cache.pop(key)
+        _init_nltk_cache[key] = value  # move to end (most recently used)
+        return value
+    except KeyError:
+        return None
+
+
+def _set_cache(key, value):
+    if key in _init_nltk_cache:
+        _init_nltk_cache.pop(key)
+    elif len(_init_nltk_cache) >= _MAX_CACHE_SIZE:
+        _init_nltk_cache.popitem(last=False)  # evict least recently used entry
+    _init_nltk_cache[key] = value
+
+
+class NLTKUnavailable(Exception):
+    pass
+
+
+def _init_nltk(download_list: Optional[Iterable[str]] = None, allow_download: bool = False, language: str = "english", keep_words: Optional[Iterable[str]] = None, remove_words: Optional[Iterable[str]] = None) -> dict[str, object]:
+    """
+    Lazily initialize NLTK and return a dict with the requested objects:
+    tokenizer, stopwords set, stemmer and lemmatizer.
+    Raises NLTKUnavailable if nltk is not installed.
+    """
+    download_tuple = tuple(sorted([d for d in (download_list or []) if d]))
+    keep_tuple = tuple(sorted([k for k in (keep_words or []) if k]))
+    remove_tuple = tuple(sorted([r for r in (remove_words or []) if r]))
+    key = (download_tuple, bool(allow_download), language, keep_tuple, remove_tuple)
+    env = _get_from_cache(key)
+    if env is not None:
+        return env
+
+    try:
+        nltk = importlib.import_module("nltk")
+    except ImportError:
+        raise NLTKUnavailable("nltk is not installed")
+
+    # helper to check a resource and optionally download it
+    def _ensure(resource_name: str, download_name: Optional[str] = None):
+        try:
+            nltk.data.find(resource_name)
+        except LookupError:
+            if allow_download:
+                download_target = download_name or resource_name
+                nltk.download(download_target, quiet=True)
+            else:
+                raise LookupError(f"NLTK resource '{resource_name}' not found. Set allow_download=True or install resources manually.")
+
+    tokenizer = None
+    stopwords_set = None
+    stemmer = None
+    lemmatizer = None
+    # punkt is used by word_tokenize
+    if 'tokenizer' in download_tuple:
+        _ensure("tokenizers/punkt", "punkt")
+        tokenizer = nltk.word_tokenize
+
+    if 'stopwords' in download_tuple:
+        _ensure("corpora/stopwords", "stopwords")
+        from nltk.corpus import stopwords as _nltk_stopwords
+        stopwords_set = set(_nltk_stopwords.words(language))
+        if isinstance(keep_words, (list, tuple)) and len(keep_words) > 0:
+            for kword in keep_words:
+                stopwords_set.discard(kword)
+        if isinstance(remove_words, (list, tuple)) and len(remove_words) > 0:
+            for rword in remove_words:
+                stopwords_set.add(rword)
+
+    # Initialize whichever canonicalizer was requested
+    if 'stemming' in download_tuple:
+        from nltk.stem import PorterStemmer
+        stemmer = PorterStemmer()
+    if 'lemmatization' in download_tuple:
+        _ensure("corpora/wordnet", "wordnet")
+        from nltk.stem import WordNetLemmatizer
+        lemmatizer = WordNetLemmatizer()
+    env = {
+        "nltk": nltk,
+        "tokenizer": tokenizer,
+        "stopwords": stopwords_set,
+        "stemmer": stemmer,
+        "lemmatizer": lemmatizer,
+    }
+    _set_cache(key, env)
+    return env
+
+
+def apply_nltk(
+    df: pd.DataFrame,
+    text_column: str,
+    output_column: str = "cleaned_nltk",
+    apply_case: Optional[Literal['lower', 'upper', 'sentence', 'title']] = None,
+    allow_download: bool = False,
+    remove_stopwords: bool = False,
+    keep_words: list = ["not", "no", "off"],
+    remove_words: list = [],
+    use_tokenizer: bool = False,
+    language: str = 'english',
+    canonicalization: Optional[Literal['stemming', 'lemmatization']] = None
+) -> pd.DataFrame:
+    """
+    Apply advanced text preprocessing using optional NLTK-based features.
+
+    This function performs configurable text normalization on a specified
+    DataFrame column, including case transformation, tokenization,
+    stopword removal, and canonicalization (stemming or lemmatization).
+    NLTK is treated as an optional dependency and is only initialized
+    when explicitly required by the chosen options.
+
+    Parameters
+    ----------
+    df : pandas.DataFrame
+        Input DataFrame containing the text data.
+
+    text_column : str
+        Name of the column in `df` that contains text to be processed.
+
+    output_column : str, default="cleaned_nltk"
+        Name of the output column where the processed text will be stored.
+
+    apply_case : {"lower", "upper", "sentence", "title"}, optional
+        Case transformation applied to the processed text:
+        - "lower" : lowercase
+        - "upper" : uppercase
+        - "sentence" : capitalize the first character
+        - "title" : title case
+
+    allow_download : bool, default=False
+        If True, automatically downloads required NLTK resources
+        (e.g., punkt, stopwords, wordnet) when missing.
+        If False, missing resources trigger a warning and the
+        lightweight fallback is used.
+
+    remove_stopwords : bool, default=False
+        If True, removes stopwords for the specified language.
+
+    keep_words : list of str, default=["not", "no", "off"]
+        Words that should be retained even if stopword removal is enabled.
+        Useful for preserving negations.
+
+    remove_words : list of str, default=[]
+        Explicit list of words to remove from the text regardless
+        of stopword settings.
+
+    use_tokenizer : bool, default=False
+        If True, uses NLTK's tokenizer (requires `punkt`).
+        If False, falls back to a lightweight whitespace-based tokenizer.
+
+    language : str, default="english"
+        Language used for stopword removal.
+
+    canonicalization : {"stemming", "lemmatization"}, optional
+        Word normalization strategy:
+        - "stemming" : applies Porter stemming
+        - "lemmatization" : applies WordNet lemmatization
+
+    Returns
+    -------
+    pandas.DataFrame
+        A copy of the input DataFrame with an additional column
+        containing the processed text.
+
+    Raises
+    ------
+    KeyError
+        If `text_column` does not exist in the DataFrame.
+
+    Notes
+    -----
+    - NLTK initialization and resource loading are cached internally
+      to avoid repeated overhead across multiple calls.
+    - If NLTK or a required resource is unavailable, a UserWarning is
+      issued and processing falls back to whitespace tokenization with
+      no stopword removal or canonicalization.
+
+    """
+    if text_column not in df.columns:
+        raise KeyError(f"Column '{text_column}' not found.")
+    df = df.copy()
+    text_field = df[text_column].astype(str)
+    tokenizer: Optional[Callable[[str], list[str]]] = None
+    stopwords_set: set[str] = set()
+    stemmer = None
+    lemmatizer = None
+    download_list = []
+    if canonicalization:
+        download_list.append(canonicalization)
+    if remove_stopwords:
+        download_list.append('stopwords')
+    if use_tokenizer:
+        download_list.append('tokenizer')
+    try:
+        env = _init_nltk(download_list=download_list, allow_download=allow_download, language=language, keep_words=keep_words, remove_words=remove_words)
+        tokenizer = env["tokenizer"]
+        stopwords_set = env["stopwords"] or set()
+        stemmer = env["stemmer"]
+        lemmatizer = env["lemmatizer"]
+    except (NLTKUnavailable, LookupError) as e:
+        warnings.warn(
+            f"NLTK unavailable or missing corpora: falling back to whitespace tokenization "
+            f"without stopword removal or canonicalization. "
+            f"Set allow_download=True to auto-download resources. {e}",
+            UserWarning, stacklevel=2
+        )
+
+    if canonicalization is None:
+        warnings.warn("No canonicalization applied", UserWarning, stacklevel=2)
+
+    def _apply(text):
+        # Lowercase first so stopword matching and stemming behave consistently.
+        text = text.lower()
+        if use_tokenizer and tokenizer is not None:
+            text = tokenizer(text)
+        else:
+            text = text.split()
+
+        if canonicalization == 'stemming' and stemmer is not None:
+            text = [stemmer.stem(word) for word in text if word not in stopwords_set]
+        elif canonicalization == 'lemmatization' and lemmatizer is not None:
+            text = [lemmatizer.lemmatize(word) for word in text if word not in stopwords_set]
+        if remove_stopwords:
+            text = [word for word in text if word not in stopwords_set]
+
+        text = ' '.join(text)
+        if apply_case == "upper":
+            text = text.upper()
+        elif apply_case == "sentence":
+            text = text.capitalize()
+        elif apply_case == "title":
+            text = text.title()
+        return text
+
+    text_field = text_field.apply(_apply)
+    df[output_column] = text_field
+    return df
+
\ No newline at end of file
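
For reference, a minimal usage sketch of the new entry points introduced by this patch (not part of the diff itself). The folder path and column names are placeholders, and the import assumes the dskit class is used directly from dskit/core.py as defined above; allow_download=True additionally assumes nltk is installed and resources can be fetched.

from dskit.core import dskit

# Load every readable file in a folder (dynamic=True ignores the extension filter)
# and report any files that load() could not parse.
kit = dskit.read_folder("data/raw", dynamic=True, display_ignored=True)

# Chainable NLTK preprocessing; allow_download=True fetches punkt/stopwords/wordnet on demand.
kit = kit.apply_nltk(
    text_column="review_text",       # placeholder column name
    output_column="cleaned_nltk",
    remove_stopwords=True,
    use_tokenizer=True,
    canonicalization="lemmatization",
    allow_download=True,
)

# Build a vocabulary list from the processed column.
vocab = kit.generate_vocabulary(text_col="cleaned_nltk", case="lower")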