
Commit 349db2a

Merge branch 'dev' into testinit1

2 parents 6a17708 + 96febab


71 files changed (+433, -306 lines)

.gitignore

Lines changed: 0 additions & 3 deletions

@@ -134,6 +134,3 @@ venv.bak/
 env3/

 *.bak
-
-#Pipfiles
-Pipfile*

.pre-commit-config.yaml

Lines changed: 13 additions & 10 deletions

@@ -2,7 +2,7 @@ repos:
   # Black: format Python code
   # https://github.com/psf/black/blob/master/.pre-commit-hooks.yaml
   - repo: https://github.com/psf/black
-    rev: 22.3.0
+    rev: 24.10.0
     hooks:
       - id: black
         types: [file, python]
@@ -19,7 +19,7 @@ repos:
   # Flake8: complexity and style checking
   # https://flake8.pycqa.org/en/latest/user/using-hooks.html
   - repo: https://github.com/pycqa/flake8
-    rev: 4.0.1
+    rev: 5.0.4
     hooks:
       - id: flake8
         additional_dependencies: [flake8-docstrings]
@@ -38,7 +38,7 @@ repos:
   # Mypy: Optional static type checking
   # https://github.com/pre-commit/mirrors-mypy
   - repo: https://github.com/pre-commit/mirrors-mypy
-    rev: v0.982
+    rev: v1.11.2
     hooks:
       - id: mypy
         exclude: (^dataprofiler/tests/|^resources/|^examples|venv*/)
@@ -48,7 +48,7 @@ repos:
            # requirements.txt
            h5py>=2.10.0,
            wheel>=0.33.1,
-           numpy>=1.22.0,
+           numpy<2.0.0,
            pandas>=1.1.2,
            python-dateutil>=2.7.5,
            pytz>=2020.1,
@@ -80,7 +80,7 @@ repos:

            # requirements-ml.txt
            scikit-learn>=0.23.2,
-           'keras>=2.4.3,<3.0.0',
+           'keras>=2.4.3,<=3.4.0',
            rapidfuzz>=2.6.1,
            "tensorflow>=2.6.4,<2.15.0; sys.platform != 'darwin'",
            "tensorflow>=2.6.4,<2.15.0; sys_platform == 'darwin' and platform_machine != 'arm64'",
@@ -108,16 +108,19 @@ repos:
     rev: "0.48"
     hooks:
       - id: check-manifest
-        additional_dependencies: ['h5py', 'wheel', 'future', 'numpy', 'pandas',
-          'python-dateutil', 'pytz', 'pyarrow', 'chardet', 'fastavro',
-          'python-snappy', 'charset-normalizer', 'psutil', 'scipy', 'requests',
-          'networkx','typing-extensions', 'HLL', 'datasketches', 'boto3']
+        additional_dependencies:
+          [
+            'matplotlib', 'h5py', 'wheel', 'future', 'numpy<2.0.0', 'pandas',
+            'python-dateutil', 'pytz', 'pyarrow', 'chardet', 'fastavro',
+            'python-snappy', 'charset-normalizer', 'psutil', 'scipy', 'requests',
+            'networkx','typing-extensions', 'HLL', 'datasketches', 'boto3',
+          ]
   # Pyupgrade - standardize and modernize Python syntax for newer versions of the language
   - repo: https://github.com/asottile/pyupgrade
     rev: v3.3.0
     hooks:
       - id: pyupgrade
-        args: ["--py38-plus"]
+        args: ["--py39-plus"]
   # Autoflake - cleanup unused variables and imports
   - repo: https://github.com/PyCQA/autoflake
     rev: v2.0.0
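Note on the version bumps: the newer mypy and the pyupgrade switch to --py39-plus line up with the PEP 585-style annotations used in the Python files below (typing.Dict/typing.List replaced by the built-in dict/list generics, which are valid at runtime on Python 3.9+). A minimal sketch of that annotation style; the summarize function is illustrative, not from this repo:

# Built-in generics (PEP 585) work directly in annotations on Python 3.9+,
# which is why the Dict/List imports can be dropped in the modules below.
def summarize(rows: list[dict[str, int]]) -> dict[str, int]:
    """Sum integer values per key across a list of dicts (illustrative)."""
    totals: dict[str, int] = {}
    for row in rows:
        for key, value in row.items():
            totals[key] = totals.get(key, 0) + value
    return totals

print(summarize([{"a": 1}, {"a": 2, "b": 3}]))  # {'a': 3, 'b': 3}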

MANIFEST.in

Lines changed: 1 addition & 1 deletion

@@ -17,4 +17,4 @@ recursive-include resources *.json
 recursive-include resources *.pb
 recursive-include resources *.py

-recursive-include dataprofiler/labelers/embeddings/ *.txt
+recursive-include dataprofiler/labelers/embeddings/*.txt

Makefile

Lines changed: 27 additions & 9 deletions

@@ -1,12 +1,19 @@
-setup: requirements.txt requirements-dev.txt requirements-test.txt
-	python3 -m venv venv
+PYTHON_VERSION ?= python3.9
+VENV_DIR ?= venv
+REQ_FILES := requirements.txt requirements-dev.txt requirements-test.txt requirements-ml.txt requirements-reports.txt

-	. venv/bin/activate && \
-	pip3 install -r requirements.txt && \
-	pip3 install -r requirements-dev.txt && \
-	pip3 install -r requirements-ml.txt && \
-	pip3 install -r requirements-reports.txt && \
-	pip3 install -r requirements-test.txt && \
+check-python:
+	@$(PYTHON_VERSION) --version | grep -E "Python (3\.9|3\.10|3\.11)" || \
+	(echo "Python 3.9, 3.10, or 3.11 is required. Ensure $(PYTHON_VERSION) is installed and try again." && exit 1)
+
+setup: check-python $(REQ_FILES)
+	@$(PYTHON_VERSION) -m venv $(VENV_DIR)
+	. $(VENV_DIR)/bin/activate && \
+	pip3 install --no-cache-dir -r requirements-ml.txt && \
+	pip3 install --no-cache-dir -r requirements.txt && \
+	pip3 install --no-cache-dir -r requirements-dev.txt && \
+	pip3 install --no-cache-dir -r requirements-reports.txt && \
+	pip3 install --no-cache-dir -r requirements-test.txt && \
 	pip3 install -e . && \
 	pre-commit install && \
 	pre-commit run
@@ -15,4 +22,15 @@ format:
 	pre-commit run

 test:
-	DATAPROFILER_SEED=0 python3 -m unittest discover -p "test*.py"
+	DATAPROFILER_SEED=0 $(VENV_DIR)/bin/python -m unittest discover -p "test*.py"
+
+clean:
+	rm -rf .pytest_cache __pycache__
+
+help:
+	@echo "Makefile Commands:"
+	@echo "  setup  - Set up the virtual environment with Python $(PYTHON_VERSION)"
+	@echo "  format - Format the code using pre-commit hooks"
+	@echo "  test   - Run unit tests with unittest"
+	@echo "  clean  - Remove temporary files (caches), but keep the virtual environment"
+	@echo "  help   - Display this help message"

dataprofiler/__init__.py

Lines changed: 1 addition & 0 deletions

@@ -1,4 +1,5 @@
 """Package for dataprofiler."""
+
 from . import settings
 from .data_readers.data import Data
 from .dp_logging import get_logger, set_verbosity

dataprofiler/_typing.py

Lines changed: 3 additions & 2 deletions

@@ -1,9 +1,10 @@
 """Contains typing aliases."""
-from typing import Dict, List, NewType, Union
+
+from typing import NewType, Union

 import numpy as np
 import pandas as pd

 DataArray = Union[pd.DataFrame, pd.Series, np.ndarray]
-JSONType = Union[str, int, float, bool, None, List, Dict]
+JSONType = Union[str, int, float, bool, None, list, dict]
 Url = NewType("Url", str)
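With the List/Dict imports dropped, JSONType uses the built-in generics directly; the alias behaves the same for type checkers and at runtime on Python 3.9+. A small usage sketch; count_top_level_keys is a hypothetical helper, not part of the module:

from typing import NewType, Union

JSONType = Union[str, int, float, bool, None, list, dict]
Url = NewType("Url", str)

def count_top_level_keys(payload: JSONType) -> int:
    """Return the number of top-level keys if payload is a dict, else 0."""
    return len(payload) if isinstance(payload, dict) else 0

print(count_top_level_keys({"a": 1, "b": 2}))  # 2
print(Url("https://example.com"))              # NewType is a plain str at runtime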

dataprofiler/data_readers/avro_data.py

Lines changed: 11 additions & 10 deletions

@@ -1,6 +1,7 @@
 """Contains class for saving and loading spreadsheet data."""
+
 from io import BytesIO, StringIO
-from typing import Any, Dict, List, Optional, Union
+from typing import Any, Optional, Union

 import fastavro

@@ -20,7 +21,7 @@ def __init__(
         self,
         input_file_path: Optional[str] = None,
         data: Optional[Any] = None,
-        options: Optional[Dict] = None,
+        options: Optional[dict] = None,
     ) -> None:
         """
         Initialize Data class for loading datasets of type AVRO.
@@ -60,22 +61,22 @@ def file_encoding(self, value: Any) -> None:
         """
         pass

-    def _load_data_from_file(self, input_file_path: str) -> List:
+    def _load_data_from_file(self, input_file_path: str) -> list:
         """Load data from file."""
         with FileOrBufferHandler(input_file_path, "rb") as input_file:
             # Currently, string reading with 'r' option has the unicode issue,
             # even when the option encoding='utf-8' is added. It may come from
             # some special compression codec, e.g., snappy. Then, binary mode
             # reading is currently used to get the dict-formatted lines.
             df_reader = fastavro.reader(input_file)
-            lines: List = list()
+            lines: list = list()
             for line in df_reader:
                 lines.append(line)
             return lines

     @classmethod
     def is_match(
-        cls, file_path: Union[str, StringIO, BytesIO], options: Optional[Dict] = None
+        cls, file_path: Union[str, StringIO, BytesIO], options: Optional[dict] = None
     ) -> bool:
         """
         Test the given file to check if the file has valid AVRO format or not.
@@ -103,7 +104,7 @@ def is_match(
         return is_valid_avro

     @classmethod
-    def _get_nested_key(cls, dict_line: Dict, nested_key: Dict) -> Dict:
+    def _get_nested_key(cls, dict_line: dict, nested_key: dict) -> dict:
         """
         Update nested keys from a dictionary and the current nested key.

@@ -131,7 +132,7 @@ def _get_nested_key(cls, dict_line: Dict, nested_key: Dict) -> Dict:
         return nested_key

     @classmethod
-    def _get_nested_keys_from_dicts(cls, dicts: List[Dict]) -> Dict:
+    def _get_nested_keys_from_dicts(cls, dicts: list[dict]) -> dict:
         """
         Extract nested keys from a list of dictionaries.

@@ -143,13 +144,13 @@ def _get_nested_keys_from_dicts(cls, dicts: List[Dict]) -> Dict:
         :type dicts: list(dict)
         :return: a dictionary containing nested keys
         """
-        nested_keys: Dict = {}
+        nested_keys: dict = {}
         for dict_line in dicts:
             nested_keys = cls._get_nested_key(dict_line, nested_keys)
         return nested_keys

     @classmethod
-    def _get_schema_avro(cls, nested_keys: Dict, schema_avro: Dict) -> Dict:
+    def _get_schema_avro(cls, nested_keys: dict, schema_avro: dict) -> dict:
         """
         Update avro schema from the nested keys and the current avro schema.

@@ -190,7 +191,7 @@ def _get_schema_avro(cls, nested_keys: Dict, schema_avro: Dict) -> Dict:
             if type(value) is dict:
                 # here, the null option to specify keys not required
                 # for every lines
-                schema_avro_temp: Dict[str, Any] = {
+                schema_avro_temp: dict[str, Any] = {
                     "name": key,
                     "type": [{"name": key, "type": "record", "fields": []}, "null"],
                 }
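Apart from the dict/list modernization, the behavior here is unchanged: fastavro.reader is handed a file object opened in binary mode ("rb") because, as the in-code comment notes, some compression codecs (e.g. snappy) break text-mode reads. A self-contained sketch of that read pattern, assuming fastavro is installed; the file name and schema are illustrative:

import fastavro

schema = fastavro.parse_schema({
    "name": "Example",
    "type": "record",
    "fields": [{"name": "id", "type": "int"}, {"name": "label", "type": "string"}],
})

# Write a tiny Avro file so the read below has something to load.
with open("example.avro", "wb") as out:
    fastavro.writer(out, schema, [{"id": 1, "label": "a"}, {"id": 2, "label": "b"}])

# Binary mode ('rb') mirrors the reader usage in _load_data_from_file.
with open("example.avro", "rb") as input_file:
    lines: list = [line for line in fastavro.reader(input_file)]

print(lines)  # [{'id': 1, 'label': 'a'}, {'id': 2, 'label': 'b'}]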

dataprofiler/data_readers/base_data.py

Lines changed: 11 additions & 9 deletions

@@ -1,9 +1,11 @@
 """Contains abstract class for data loading and saving."""
+
 import locale
 import sys
 from collections import OrderedDict
+from collections.abc import Generator
 from io import StringIO
-from typing import Any, Dict, Generator, List, Optional, Union
+from typing import Any, Optional, Union

 import numpy as np
 import pandas as pd
@@ -21,7 +23,7 @@ class BaseData:
     info: Optional[str] = None

     def __init__(
-        self, input_file_path: Optional[str], data: Any, options: Dict
+        self, input_file_path: Optional[str], data: Any, options: dict
     ) -> None:
         """
         Initialize Base class for loading a dataset.
@@ -42,7 +44,7 @@ def __init__(

         # Public properties
         self.input_file_path = input_file_path
-        self.options: Optional[Dict] = options
+        self.options: Optional[dict] = options

         # 'Private' properties
         # _data_formats: dict containing data_formats (key) and function
@@ -56,10 +58,10 @@ def __init__(
         #   constant across function calls.
         # _tmp_file_name: randomly set variables for file name usable by system
         # _file_encoding: contains the suggested file encoding for reading data
-        self._data_formats: Dict[str, Any] = OrderedDict()
+        self._data_formats: dict[str, Any] = OrderedDict()
         self._selected_data_format: Optional[str] = None
         self._data: Optional[Any] = data
-        self._batch_info: Dict = dict(perm=list(), iter=0)
+        self._batch_info: dict = dict(perm=list(), iter=0)
         self._tmp_file_name: Optional[str] = None
         self._file_encoding: Optional[str] = options.get("encoding", None)

@@ -137,7 +139,7 @@ def file_encoding(self, value: str) -> None:
         self._file_encoding = value

     @staticmethod
-    def _check_and_return_options(options: Optional[Dict]) -> Dict:
+    def _check_and_return_options(options: Optional[dict]) -> dict:
         """Return options or raise error."""
         if not options:
             options = dict()
@@ -151,7 +153,7 @@ def _load_data(self, data: Optional[Any] = None) -> None:

     def get_batch_generator(
         self, batch_size: int
-    ) -> Generator[Union[pd.DataFrame, List], None, None]:
+    ) -> Generator[Union[pd.DataFrame, list], None, None]:
         """Get batch generator."""
         data_length = len(self.data)
         indices = np.random.permutation(data_length)
@@ -162,12 +164,12 @@ def get_batch_generator(
             yield list(self.data[k] for k in indices[i : i + batch_size])

     @classmethod
-    def is_match(cls, input_file_path: str, options: Optional[Dict]) -> bool:
+    def is_match(cls, input_file_path: str, options: Optional[dict]) -> bool:
         """Return true if match, false otherwise."""
         raise NotImplementedError()

     def reload(
-        self, input_file_path: Optional[str], data: Any, options: Optional[Dict]
+        self, input_file_path: Optional[str], data: Any, options: Optional[dict]
     ) -> None:
         """
         Reload the data class with a new dataset.
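Beyond the dict/list changes, the notable import change here is Generator moving from typing to collections.abc, which is its non-deprecated home on Python 3.9+. A stand-alone sketch of the batching pattern that get_batch_generator's signature describes, assuming a pandas DataFrame; the function name and data are illustrative, not the library's API:

from collections.abc import Generator  # same import style as the updated module
from typing import Union

import numpy as np
import pandas as pd

def batch_generator(
    data: pd.DataFrame, batch_size: int
) -> Generator[Union[pd.DataFrame, list], None, None]:
    """Yield shuffled batches of rows, mirroring the signature above."""
    indices = np.random.permutation(len(data))  # one random permutation per pass
    for i in range(0, len(data), batch_size):
        yield data.iloc[indices[i : i + batch_size]]

df = pd.DataFrame({"x": range(10)})
for batch in batch_generator(df, batch_size=4):
    print(len(batch))  # 4, 4, 2 (row order is random)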
