diff --git a/dskit/cleaning.py b/dskit/cleaning.py
index 85b1e90..bb28ee5 100644
--- a/dskit/cleaning.py
+++ b/dskit/cleaning.py
@@ -113,6 +113,8 @@ def outlier_summary(df, method='iqr', threshold=1.5):
         elif method == 'zscore':
             mean = df[col].mean()
             std = df[col].std()
+            if std == 0:
+                continue
             z_scores = (df[col] - mean) / std
             outliers = df[np.abs(z_scores) > 3]
 
@@ -139,6 +141,8 @@ def remove_outliers(df, method='iqr', threshold=1.5):
         elif method == 'zscore':
             mean = df[col].mean()
             std = df[col].std()
+            if std == 0:
+                continue
             z_scores = (df[col] - mean) / std
             df = df[np.abs(z_scores) <= 3]
 
diff --git a/dskit/core.py b/dskit/core.py
index b400fcb..57725d7 100644
--- a/dskit/core.py
+++ b/dskit/core.py
@@ -1,7 +1,7 @@
 import pandas as pd
 from . import io, cleaning, visualization, preprocessing, modeling, explainability, eda
 from . import feature_engineering, nlp_utils, advanced_visualization, advanced_modeling, auto_ml, comprehensive_eda
-
+from typing import Literal, Optional, Annotated
 class dskit:
     def __init__(self, df=None):
         self.df = df
@@ -17,8 +17,8 @@ def load(filepath):
         return dskit(io.load(filepath))
 
     @staticmethod
-    def read_folder(folder_path, file_type='csv'):
-        return dskit(io.read_folder(folder_path, file_type))
+    def read_folder(folder_path: str, file_type: Literal['csv', 'xls', 'xlsx', 'json', 'parquet'] = 'csv', dynamic: bool = False, display_ignored: bool = False):
+        return dskit(io.read_folder(folder_path, file_type, dynamic, display_ignored))
 
     def save(self, filepath, **kwargs):
         io.save(self.df, filepath, **kwargs)
@@ -73,7 +73,47 @@ def text_stats(self, text_cols=None):
     def generate_wordcloud(self, text_col, max_words=100):
         nlp_utils.generate_wordcloud(self.df, text_col, max_words)
         return self
-
+    def generate_vocabulary(self, text_col: str, case: Optional[Literal['lower', 'upper']] = None):
+        return nlp_utils.generate_vocabulary(self.df, text_col, case)
+    def apply_nltk(
+        self,
+        text_column: Annotated[str, "Column name containing raw text"],
+        output_column: Annotated[str, "Output column name for processed text"] = "cleaned_nltk",
+        apply_case: Annotated[
+            Optional[Literal['lower', 'upper', 'sentence', 'title']],
+            "Case transformation to apply"
+        ] = 'lower',
+        allow_download: Annotated[bool, "Automatically download required NLTK resources if missing"] = False,
+        remove_stopwords: Annotated[bool, "Remove stopwords using NLTK stopword corpus"] = True,
+        keep_words: Annotated[
+            list[str],
+            "Words to retain even if stopword removal is enabled"
+        ] = ["not", "no", "off"],
+        remove_words: Annotated[
+            list[str],
+            "Words to explicitly remove from the text"
+        ] = [],
+        use_tokenizer: Annotated[bool, "Use NLTK tokenizer instead of simple whitespace split"] = True,
+        language: Annotated[str, "Language for stopword removal"] = 'english',
+        canonicalization: Annotated[
+            Optional[Literal['stemming', 'lemmatization']],
+            "Canonicalization strategy"
+        ] = 'stemming'
+    ) -> "dskit":
+        self.df = nlp_utils.apply_nltk(
+            df=self.df,
+            text_column=text_column,
+            output_column=output_column,
+            apply_case=apply_case,
+            allow_download=allow_download,
+            remove_stopwords=remove_stopwords,
+            keep_words=keep_words,
+            remove_words=remove_words,
+            use_tokenizer=use_tokenizer,
+            language=language,
+            canonicalization=canonicalization
+        )
+        return self
     def clean(self):
         """
         Chains multiple cleaning steps: fix_dtypes -> rename_columns -> fill_missing
diff --git a/dskit/io.py b/dskit/io.py
index 8e8c9c7..818e1e5 100644
--- a/dskit/io.py
+++ b/dskit/io.py
@@ -27,25 +27,49 @@ def load(filepath):
         print(f"Error loading file: {e}")
         return None
 
-def read_folder(folder_path, file_type='csv'):
+def read_folder(folder_path: str, file_type: str = 'csv', dynamic: bool = False, display_ignored: bool = False):
     """
-    Loads multiple files from a folder and concatenates them.
+    Load and concatenate tabular files from a folder.
+    Parameters
+    ----------
+    folder_path : str
+        Path to the directory containing the files to be loaded.
+
+    file_type : str, default='csv'
+        File extension to filter files when `dynamic=False`.
+        Example: 'csv', 'xlsx', 'parquet'.
+
+    dynamic : bool, default=False
+        If True, loads all files regardless of extension.
+        If False, only files matching `file_type` are loaded.
+
+    display_ignored : bool, default=False
+        If True, prints the list of files that were skipped
+        because they could not be loaded by the `load()` function.
     """
     if not os.path.exists(folder_path):
         raise FileNotFoundError(f"The folder '{folder_path}' was not found.")
-
-    all_files = glob.glob(os.path.join(folder_path, f"*.{file_type}"))
-
-    if not all_files:
-        print(f"No files found with extension .{file_type} in {folder_path}")
-        return None
+    if dynamic:
+        all_files = glob.glob(os.path.join(folder_path, "*.*"))
+        if not all_files:
+            print(f"No files found in {folder_path}")
+            return None
+    else:
+        all_files = glob.glob(os.path.join(folder_path, f"*.{file_type}"))
+        if not all_files:
+            print(f"No files found with extension .{file_type} in {folder_path}")
+            return None
 
     df_list = []
+    ignored = []
     for filename in all_files:
         df = load(filename)
         if df is not None:
             df_list.append(df)
-
+        else:
+            ignored.append(filename)
+    if display_ignored and ignored:
+        print("Ignored files:\n" + "\n".join(ignored))
     if df_list:
         return pd.concat(df_list, ignore_index=True)
     else:
diff --git a/dskit/nlp_utils.py b/dskit/nlp_utils.py
index 42a56dc..99f6f19 100644
--- a/dskit/nlp_utils.py
+++ b/dskit/nlp_utils.py
@@ -1,16 +1,19 @@
 import pandas as pd
 import numpy as np
+from typing import Literal, Optional, Iterable, Callable
+from collections import OrderedDict
+import re
+import warnings
+import importlib
+import string
+import matplotlib.pyplot as plt
 try:
-    import nltk
     from textblob import TextBlob
     from wordcloud import WordCloud
 except ImportError:
-    nltk = None
     TextBlob = None
     WordCloud = None
-import re
-import string
-import matplotlib.pyplot as plt
+
 
 def basic_text_stats(df, text_cols=None):
     """
@@ -230,4 +233,267 @@ def detect_language(df, text_col):
             languages.append('unknown')
 
     df[f'{text_col}_language'] = languages
-    return df
\ No newline at end of file
+    return df
+
+
+def generate_vocabulary(df: pd.DataFrame, text_col: str, case: Optional[Literal['lower', 'upper']] = None):
+    """
+    Return a list of the unique tokens found in a text column of the DataFrame.
+
+    :param df: dataframe
+    :type df: pd.DataFrame
+    :param text_col: name of the text column
+    :type text_col: str
+    :param case: case applied to the tokens. If not provided, words remain unchanged.
+    :type case: Literal['lower', 'upper']
+    """
+    if text_col not in df.columns:
+        print(f"Column '{text_col}' not found.")
+        return []
+    vocabulary = set()
+    for text in df[text_col].astype(str):
+        if case == 'lower':
+            text = text.lower()
+        elif case == 'upper':
+            text = text.upper()
+        text = text.split()
+        for t in text:
+            vocabulary.add(t)
+    return list(vocabulary)
+
+
+# nltk application
+_init_nltk_cache = OrderedDict()
+_MAX_CACHE_SIZE = 2
+
+
+def _get_from_cache(key):
+    try:
+        value = _init_nltk_cache.pop(key)
+        _init_nltk_cache[key] = value  # move to end (most recently used)
+        return value
+    except KeyError:
+        return None
+
+
+def _set_cache(key, value):
+    if key in _init_nltk_cache:
+        _init_nltk_cache.pop(key)
+    elif len(_init_nltk_cache) >= _MAX_CACHE_SIZE:
+        _init_nltk_cache.popitem(last=False)  # evict least recently used entry
+    _init_nltk_cache[key] = value
+
+
+class NLTKUnavailable(Exception):
+    pass
+
+
+def _init_nltk(download_list: Optional[Iterable[str]] = None, allow_download: bool = False, language: str = "english", keep_words: Optional[Iterable[str]] = None, remove_words: Optional[Iterable[str]] = None) -> dict[str, object]:
+    """
+    Lazily initialize NLTK and return a dict with the requested objects:
+    tokenizer, stopwords set, stemmer and lemmatizer.
+    Raises NLTKUnavailable if nltk is not installed.
+    """
+    download_tuple = tuple(sorted([d for d in (download_list or []) if d]))
+    keep_tuple = tuple(sorted([k for k in (keep_words or []) if k]))
+    remove_tuple = tuple(sorted([r for r in (remove_words or []) if r]))
+    key = (download_tuple, bool(allow_download), language, keep_tuple, remove_tuple)
+    env = _get_from_cache(key)
+    if env is not None:
+        return env
+
+    try:
+        nltk = importlib.import_module("nltk")
+    except ImportError:
+        raise NLTKUnavailable("nltk is not installed")
+
+    # helper to check a resource and optionally download it
+    def _ensure(resource_name: str, download_name: Optional[str] = None):
+        try:
+            nltk.data.find(resource_name)
+        except LookupError:
+            if allow_download:
+                download_target = download_name or resource_name
+                nltk.download(download_target, quiet=True)
+            else:
+                raise LookupError(f"NLTK resource '{resource_name}' not found. Set allow_download=True or install resources manually.")
+
+    tokenizer = None
+    stopwords_set = None
+    stemmer = None
+    lemmatizer = None
+    # punkt is used by word_tokenize
+    if 'tokenizer' in download_tuple:
+        _ensure("tokenizers/punkt", "punkt")
+        tokenizer = nltk.word_tokenize
+
+    if 'stopwords' in download_tuple:
+        _ensure("corpora/stopwords", "stopwords")
+        from nltk.corpus import stopwords as _nltk_stopwords
+        stopwords_set = set(_nltk_stopwords.words(language))
+        if isinstance(keep_words, (list, tuple)) and len(keep_words) > 0:
+            for kword in keep_words:
+                stopwords_set.discard(kword)
+        if isinstance(remove_words, (list, tuple)) and len(remove_words) > 0:
+            for rword in remove_words:
+                stopwords_set.add(rword)
+
+    # Initialize whichever canonicalizer was requested
+    if 'stemming' in download_tuple:
+        from nltk.stem import PorterStemmer
+        stemmer = PorterStemmer()
+    if 'lemmatization' in download_tuple:
+        _ensure("corpora/wordnet", "wordnet")
+        from nltk.stem import WordNetLemmatizer
+        lemmatizer = WordNetLemmatizer()
+    env = {
+        "nltk": nltk,
+        "tokenizer": tokenizer,
+        "stopwords": stopwords_set,
+        "stemmer": stemmer,
+        "lemmatizer": lemmatizer,
+    }
+    _set_cache(key, env)
+    return env
+
+
+def apply_nltk(
+    df: pd.DataFrame,
+    text_column: str,
+    output_column: str = "cleaned_nltk",
+    apply_case: Optional[Literal['lower', 'upper', 'sentence', 'title']] = None,
+    allow_download: bool = False,
+    remove_stopwords: bool = False,
+    keep_words: list = ["not", "no", "off"],
+    remove_words: list = [],
+    use_tokenizer: bool = False,
+    language: str = 'english',
+    canonicalization: Optional[Literal['stemming', 'lemmatization']] = None
+) -> pd.DataFrame:
+    """
+    Apply advanced text preprocessing using optional NLTK-based features.
+
+    This function performs configurable text normalization on a specified
+    DataFrame column, including case transformation, tokenization,
+    stopword removal, and canonicalization (stemming or lemmatization).
+    NLTK is treated as an optional dependency and is only initialized
+    when explicitly required by the chosen options.
+
+    Parameters
+    ----------
+    df : pandas.DataFrame
+        Input DataFrame containing the text data.
+
+    text_column : str
+        Name of the column in `df` that contains text to be processed.
+
+    output_column : str, default="cleaned_nltk"
+        Name of the output column where the processed text will be stored.
+
+    apply_case : {"lower", "upper", "sentence", "title"}, optional
+        Case transformation applied to the processed text:
+        - "lower" : lowercase
+        - "upper" : uppercase
+        - "sentence" : capitalize the first character
+        - "title" : title case
+
+    allow_download : bool, default=False
+        If True, automatically downloads required NLTK resources
+        (e.g., punkt, stopwords, wordnet) when missing.
+        If False, missing resources trigger a warning and the
+        lightweight fallback is used.
+
+    remove_stopwords : bool, default=False
+        If True, removes stopwords for the specified language.
+
+    keep_words : list of str, default=["not", "no", "off"]
+        Words that should be retained even if stopword removal is enabled.
+        Useful for preserving negations.
+
+    remove_words : list of str, default=[]
+        Explicit list of words to remove from the text regardless
+        of stopword settings.
+
+    use_tokenizer : bool, default=False
+        If True, uses NLTK's tokenizer (requires `punkt`).
+        If False, falls back to a lightweight whitespace-based tokenizer.
+
+    language : str, default="english"
+        Language used for stopword removal.
+
+    canonicalization : {"stemming", "lemmatization"}, optional
+        Word normalization strategy:
+        - "stemming" : applies Porter stemming
+        - "lemmatization" : applies WordNet lemmatization
+
+    Returns
+    -------
+    pandas.DataFrame
+        A copy of the input DataFrame with an additional column
+        containing the processed text.
+
+    Raises
+    ------
+    KeyError
+        If `text_column` does not exist in the DataFrame.
+
+    Notes
+    -----
+    - NLTK initialization and resource loading are cached internally
+      to avoid repeated overhead across multiple calls.
+    - If NLTK or a required resource is unavailable, a UserWarning is
+      issued and processing falls back to whitespace tokenization with
+      no stopword removal or canonicalization.
+
+    """
+    if text_column not in df.columns:
+        raise KeyError(f"Column '{text_column}' not found.")
+    df = df.copy()
+    text_field = df[text_column].astype(str)
+    tokenizer: Optional[Callable[[str], list[str]]] = None
+    stopwords_set: set[str] = set()
+    stemmer = None
+    lemmatizer = None
+    download_list = []
+    if canonicalization:
+        download_list.append(canonicalization)
+    if remove_stopwords:
+        download_list.append('stopwords')
+    if use_tokenizer:
+        download_list.append('tokenizer')
+    try:
+        env = _init_nltk(download_list=download_list, allow_download=allow_download, language=language, keep_words=keep_words, remove_words=remove_words)
+        tokenizer = env["tokenizer"]
+        stopwords_set = env["stopwords"] or set()
+        stemmer = env["stemmer"]
+        lemmatizer = env["lemmatizer"]
+    except (NLTKUnavailable, LookupError) as e:
+        warnings.warn(
+            f"NLTK unavailable or missing corpora: falling back to whitespace tokenization "
+            f"without stopword removal or canonicalization. "
+            f"Set allow_download=True to auto-download resources. {e}",
+            UserWarning, stacklevel=2
+        )
+
+    if canonicalization is None:
+        warnings.warn("No canonicalization applied", UserWarning, stacklevel=2)
+
+    def _apply(text):
+        # Lowercase first so stopword matching and stemming behave consistently.
+        text = text.lower()
+        if use_tokenizer and tokenizer is not None:
+            text = tokenizer(text)
+        else:
+            text = text.split()
+
+        if canonicalization == 'stemming' and stemmer is not None:
+            text = [stemmer.stem(word) for word in text if word not in stopwords_set]
+        elif canonicalization == 'lemmatization' and lemmatizer is not None:
+            text = [lemmatizer.lemmatize(word) for word in text if word not in stopwords_set]
+        if remove_stopwords:
+            text = [word for word in text if word not in stopwords_set]
+
+        text = ' '.join(text)
+        if apply_case == "upper":
+            text = text.upper()
+        elif apply_case == "sentence":
+            text = text.capitalize()
+        elif apply_case == "title":
+            text = text.title()
+        return text
+
+    text_field = text_field.apply(_apply)
+    df[output_column] = text_field
+    return df
+
\ No newline at end of file
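
For reference, a minimal usage sketch of the new entry points introduced by this patch (not part of the diff itself). The folder path and column names are placeholders, and the import assumes the dskit class is used directly from dskit/core.py as defined above; allow_download=True additionally assumes nltk is installed and resources can be fetched.

from dskit.core import dskit

# Load every readable file in a folder (dynamic=True ignores the extension filter)
# and report any files that load() could not parse.
kit = dskit.read_folder("data/raw", dynamic=True, display_ignored=True)

# Chainable NLTK preprocessing; allow_download=True fetches punkt/stopwords/wordnet on demand.
kit = kit.apply_nltk(
    text_column="review_text",       # placeholder column name
    output_column="cleaned_nltk",
    remove_stopwords=True,
    use_tokenizer=True,
    canonicalization="lemmatization",
    allow_download=True,
)

# Build a vocabulary list from the processed column.
vocab = kit.generate_vocabulary(text_col="cleaned_nltk", case="lower")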