
Commit 349db2a

Merge branch 'dev' into testinit1

2 parents 6a17708 + 96febab


71 files changed (+433, -306 lines)

.gitignore

Lines changed: 0 additions & 3 deletions

@@ -134,6 +134,3 @@ venv.bak/
 env3/

 *.bak
-
-#Pipfiles
-Pipfile*

.pre-commit-config.yaml

Lines changed: 13 additions & 10 deletions

@@ -2,7 +2,7 @@ repos:
   # Black: format Python code
   # https://github.com/psf/black/blob/master/.pre-commit-hooks.yaml
   - repo: https://github.com/psf/black
-    rev: 22.3.0
+    rev: 24.10.0
     hooks:
       - id: black
         types: [file, python]
@@ -19,7 +19,7 @@ repos:
   # Flake8: complexity and style checking
   # https://flake8.pycqa.org/en/latest/user/using-hooks.html
   - repo: https://github.com/pycqa/flake8
-    rev: 4.0.1
+    rev: 5.0.4
     hooks:
       - id: flake8
         additional_dependencies: [flake8-docstrings]
@@ -38,7 +38,7 @@ repos:
   # Mypy: Optional static type checking
   # https://github.com/pre-commit/mirrors-mypy
   - repo: https://github.com/pre-commit/mirrors-mypy
-    rev: v0.982
+    rev: v1.11.2
     hooks:
       - id: mypy
         exclude: (^dataprofiler/tests/|^resources/|^examples|venv*/)
@@ -48,7 +48,7 @@ repos:
            # requirements.txt
            h5py>=2.10.0,
            wheel>=0.33.1,
-           numpy>=1.22.0,
+           numpy<2.0.0,
            pandas>=1.1.2,
            python-dateutil>=2.7.5,
            pytz>=2020.1,
@@ -80,7 +80,7 @@ repos:

            # requirements-ml.txt
            scikit-learn>=0.23.2,
-           'keras>=2.4.3,<3.0.0',
+           'keras>=2.4.3,<=3.4.0',
            rapidfuzz>=2.6.1,
            "tensorflow>=2.6.4,<2.15.0; sys.platform != 'darwin'",
            "tensorflow>=2.6.4,<2.15.0; sys_platform == 'darwin' and platform_machine != 'arm64'",
@@ -108,16 +108,19 @@ repos:
     rev: "0.48"
     hooks:
       - id: check-manifest
-        additional_dependencies: ['h5py', 'wheel', 'future', 'numpy', 'pandas',
-          'python-dateutil', 'pytz', 'pyarrow', 'chardet', 'fastavro',
-          'python-snappy', 'charset-normalizer', 'psutil', 'scipy', 'requests',
-          'networkx','typing-extensions', 'HLL', 'datasketches', 'boto3']
+        additional_dependencies:
+          [
+            'matplotlib', 'h5py', 'wheel', 'future', 'numpy<2.0.0', 'pandas',
+            'python-dateutil', 'pytz', 'pyarrow', 'chardet', 'fastavro',
+            'python-snappy', 'charset-normalizer', 'psutil', 'scipy', 'requests',
+            'networkx','typing-extensions', 'HLL', 'datasketches', 'boto3',
+          ]
   # Pyupgrade - standardize and modernize Python syntax for newer versions of the language
   - repo: https://github.com/asottile/pyupgrade
     rev: v3.3.0
     hooks:
       - id: pyupgrade
-        args: ["--py38-plus"]
+        args: ["--py39-plus"]
   # Autoflake - cleanup unused variables and imports
   - repo: https://github.com/PyCQA/autoflake
     rev: v2.0.0
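Note on the version bumps: the newer mypy and the pyupgrade switch to --py39-plus line up with the PEP 585-style annotations used in the Python files below (typing.Dict/typing.List replaced by the built-in dict/list generics, which are valid at runtime on Python 3.9+). A minimal sketch of that annotation style; the summarize function is illustrative, not from this repo:

# Built-in generics (PEP 585) work directly in annotations on Python 3.9+,
# which is why the Dict/List imports can be dropped in the modules below.
def summarize(rows: list[dict[str, int]]) -> dict[str, int]:
    """Sum integer values per key across a list of dicts (illustrative)."""
    totals: dict[str, int] = {}
    for row in rows:
        for key, value in row.items():
            totals[key] = totals.get(key, 0) + value
    return totals

print(summarize([{"a": 1}, {"a": 2, "b": 3}]))  # {'a': 3, 'b': 3}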

MANIFEST.in

Lines changed: 1 addition & 1 deletion

@@ -17,4 +17,4 @@ recursive-include resources *.json
 recursive-include resources *.pb
 recursive-include resources *.py

-recursive-include dataprofiler/labelers/embeddings/ *.txt
+recursive-include dataprofiler/labelers/embeddings/*.txt

Makefile

Lines changed: 27 additions & 9 deletions

@@ -1,12 +1,19 @@
-setup: requirements.txt requirements-dev.txt requirements-test.txt
-	python3 -m venv venv
+PYTHON_VERSION ?= python3.9
+VENV_DIR ?= venv
+REQ_FILES := requirements.txt requirements-dev.txt requirements-test.txt requirements-ml.txt requirements-reports.txt

-	. venv/bin/activate && \
-	pip3 install -r requirements.txt && \
-	pip3 install -r requirements-dev.txt && \
-	pip3 install -r requirements-ml.txt && \
-	pip3 install -r requirements-reports.txt && \
-	pip3 install -r requirements-test.txt && \
+check-python:
+	@$(PYTHON_VERSION) --version | grep -E "Python (3\.9|3\.10|3\.11)" || \
+	(echo "Python 3.9, 3.10, or 3.11 is required. Ensure $(PYTHON_VERSION) is installed and try again." && exit 1)
+
+setup: check-python $(REQ_FILES)
+	@$(PYTHON_VERSION) -m venv $(VENV_DIR)
+	. $(VENV_DIR)/bin/activate && \
+	pip3 install --no-cache-dir -r requirements-ml.txt && \
+	pip3 install --no-cache-dir -r requirements.txt && \
+	pip3 install --no-cache-dir -r requirements-dev.txt && \
+	pip3 install --no-cache-dir -r requirements-reports.txt && \
+	pip3 install --no-cache-dir -r requirements-test.txt && \
 	pip3 install -e . && \
 	pre-commit install && \
 	pre-commit run
@@ -15,4 +22,15 @@ format:
 	pre-commit run

 test:
-	DATAPROFILER_SEED=0 python3 -m unittest discover -p "test*.py"
+	DATAPROFILER_SEED=0 $(VENV_DIR)/bin/python -m unittest discover -p "test*.py"
+
+clean:
+	rm -rf .pytest_cache __pycache__
+
+help:
+	@echo "Makefile Commands:"
+	@echo "  setup  - Set up the virtual environment with Python $(PYTHON_VERSION)"
+	@echo "  format - Format the code using pre-commit hooks"
+	@echo "  test   - Run unit tests with unittest"
+	@echo "  clean  - Remove temporary files (caches), but keep the virtual environment"
+	@echo "  help   - Display this help message"

dataprofiler/__init__.py

Lines changed: 1 addition & 0 deletions

@@ -1,4 +1,5 @@
 """Package for dataprofiler."""
+
 from . import settings
 from .data_readers.data import Data
 from .dp_logging import get_logger, set_verbosity

dataprofiler/_typing.py

Lines changed: 3 additions & 2 deletions

@@ -1,9 +1,10 @@
 """Contains typing aliases."""
-from typing import Dict, List, NewType, Union
+
+from typing import NewType, Union

 import numpy as np
 import pandas as pd

 DataArray = Union[pd.DataFrame, pd.Series, np.ndarray]
-JSONType = Union[str, int, float, bool, None, List, Dict]
+JSONType = Union[str, int, float, bool, None, list, dict]
 Url = NewType("Url", str)
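With the List/Dict imports dropped, JSONType uses the built-in generics directly; the alias behaves the same for type checkers and at runtime on Python 3.9+. A small usage sketch; count_top_level_keys is a hypothetical helper, not part of the module:

from typing import NewType, Union

JSONType = Union[str, int, float, bool, None, list, dict]
Url = NewType("Url", str)

def count_top_level_keys(payload: JSONType) -> int:
    """Return the number of top-level keys if payload is a dict, else 0."""
    return len(payload) if isinstance(payload, dict) else 0

print(count_top_level_keys({"a": 1, "b": 2}))  # 2
print(Url("https://example.com"))              # NewType is a plain str at runtime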

dataprofiler/data_readers/avro_data.py

Lines changed: 11 additions & 10 deletions

@@ -1,6 +1,7 @@
 """Contains class for saving and loading spreadsheet data."""
+
 from io import BytesIO, StringIO
-from typing import Any, Dict, List, Optional, Union
+from typing import Any, Optional, Union

 import fastavro

@@ -20,7 +21,7 @@ def __init__(
         self,
         input_file_path: Optional[str] = None,
         data: Optional[Any] = None,
-        options: Optional[Dict] = None,
+        options: Optional[dict] = None,
     ) -> None:
         """
         Initialize Data class for loading datasets of type AVRO.
@@ -60,22 +61,22 @@ def file_encoding(self, value: Any) -> None:
         """
         pass

-    def _load_data_from_file(self, input_file_path: str) -> List:
+    def _load_data_from_file(self, input_file_path: str) -> list:
         """Load data from file."""
         with FileOrBufferHandler(input_file_path, "rb") as input_file:
             # Currently, string reading with 'r' option has the unicode issue,
             # even when the option encoding='utf-8' is added. It may come from
             # some special compression codec, e.g., snappy. Then, binary mode
             # reading is currently used to get the dict-formatted lines.
             df_reader = fastavro.reader(input_file)
-            lines: List = list()
+            lines: list = list()
             for line in df_reader:
                 lines.append(line)
             return lines

     @classmethod
     def is_match(
-        cls, file_path: Union[str, StringIO, BytesIO], options: Optional[Dict] = None
+        cls, file_path: Union[str, StringIO, BytesIO], options: Optional[dict] = None
     ) -> bool:
         """
         Test the given file to check if the file has valid AVRO format or not.
@@ -103,7 +104,7 @@ def is_match(
         return is_valid_avro

     @classmethod
-    def _get_nested_key(cls, dict_line: Dict, nested_key: Dict) -> Dict:
+    def _get_nested_key(cls, dict_line: dict, nested_key: dict) -> dict:
         """
         Update nested keys from a dictionary and the current nested key.

@@ -131,7 +132,7 @@ def _get_nested_key(cls, dict_line: Dict, nested_key: Dict) -> Dict:
         return nested_key

     @classmethod
-    def _get_nested_keys_from_dicts(cls, dicts: List[Dict]) -> Dict:
+    def _get_nested_keys_from_dicts(cls, dicts: list[dict]) -> dict:
         """
         Extract nested keys from a list of dictionaries.

@@ -143,13 +144,13 @@ def _get_nested_keys_from_dicts(cls, dicts: List[Dict]) -> Dict:
         :type dicts: list(dict)
         :return: a dictionary containing nested keys
         """
-        nested_keys: Dict = {}
+        nested_keys: dict = {}
         for dict_line in dicts:
             nested_keys = cls._get_nested_key(dict_line, nested_keys)
         return nested_keys

     @classmethod
-    def _get_schema_avro(cls, nested_keys: Dict, schema_avro: Dict) -> Dict:
+    def _get_schema_avro(cls, nested_keys: dict, schema_avro: dict) -> dict:
         """
         Update avro schema from the nested keys and the current avro schema.

@@ -190,7 +191,7 @@ def _get_schema_avro(cls, nested_keys: Dict, schema_avro: Dict) -> Dict:
             if type(value) is dict:
                 # here, the null option to specify keys not required
                 # for every lines
-                schema_avro_temp: Dict[str, Any] = {
+                schema_avro_temp: dict[str, Any] = {
                     "name": key,
                     "type": [{"name": key, "type": "record", "fields": []}, "null"],
                 }
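Apart from the dict/list modernization, the behavior here is unchanged: fastavro.reader is handed a file object opened in binary mode ("rb") because, as the in-code comment notes, some compression codecs (e.g. snappy) break text-mode reads. A self-contained sketch of that read pattern, assuming fastavro is installed; the file name and schema are illustrative:

import fastavro

schema = fastavro.parse_schema({
    "name": "Example",
    "type": "record",
    "fields": [{"name": "id", "type": "int"}, {"name": "label", "type": "string"}],
})

# Write a tiny Avro file so the read below has something to load.
with open("example.avro", "wb") as out:
    fastavro.writer(out, schema, [{"id": 1, "label": "a"}, {"id": 2, "label": "b"}])

# Binary mode ('rb') mirrors the reader usage in _load_data_from_file.
with open("example.avro", "rb") as input_file:
    lines: list = [line for line in fastavro.reader(input_file)]

print(lines)  # [{'id': 1, 'label': 'a'}, {'id': 2, 'label': 'b'}]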

dataprofiler/data_readers/base_data.py

Lines changed: 11 additions & 9 deletions

@@ -1,9 +1,11 @@
 """Contains abstract class for data loading and saving."""
+
 import locale
 import sys
 from collections import OrderedDict
+from collections.abc import Generator
 from io import StringIO
-from typing import Any, Dict, Generator, List, Optional, Union
+from typing import Any, Optional, Union

 import numpy as np
 import pandas as pd
@@ -21,7 +23,7 @@ class BaseData:
     info: Optional[str] = None

     def __init__(
-        self, input_file_path: Optional[str], data: Any, options: Dict
+        self, input_file_path: Optional[str], data: Any, options: dict
     ) -> None:
         """
         Initialize Base class for loading a dataset.
@@ -42,7 +44,7 @@ def __init__(

         # Public properties
         self.input_file_path = input_file_path
-        self.options: Optional[Dict] = options
+        self.options: Optional[dict] = options

         # 'Private' properties
         # _data_formats: dict containing data_formats (key) and function
@@ -56,10 +58,10 @@ def __init__(
         #   constant across function calls.
         # _tmp_file_name: randomly set variables for file name usable by system
         # _file_encoding: contains the suggested file encoding for reading data
-        self._data_formats: Dict[str, Any] = OrderedDict()
+        self._data_formats: dict[str, Any] = OrderedDict()
         self._selected_data_format: Optional[str] = None
         self._data: Optional[Any] = data
-        self._batch_info: Dict = dict(perm=list(), iter=0)
+        self._batch_info: dict = dict(perm=list(), iter=0)
         self._tmp_file_name: Optional[str] = None
         self._file_encoding: Optional[str] = options.get("encoding", None)

@@ -137,7 +139,7 @@ def file_encoding(self, value: str) -> None:
         self._file_encoding = value

     @staticmethod
-    def _check_and_return_options(options: Optional[Dict]) -> Dict:
+    def _check_and_return_options(options: Optional[dict]) -> dict:
         """Return options or raise error."""
         if not options:
             options = dict()
@@ -151,7 +153,7 @@ def _load_data(self, data: Optional[Any] = None) -> None:

     def get_batch_generator(
         self, batch_size: int
-    ) -> Generator[Union[pd.DataFrame, List], None, None]:
+    ) -> Generator[Union[pd.DataFrame, list], None, None]:
         """Get batch generator."""
         data_length = len(self.data)
         indices = np.random.permutation(data_length)
@@ -162,12 +164,12 @@ def get_batch_generator(
             yield list(self.data[k] for k in indices[i : i + batch_size])

     @classmethod
-    def is_match(cls, input_file_path: str, options: Optional[Dict]) -> bool:
+    def is_match(cls, input_file_path: str, options: Optional[dict]) -> bool:
         """Return true if match, false otherwise."""
         raise NotImplementedError()

     def reload(
-        self, input_file_path: Optional[str], data: Any, options: Optional[Dict]
+        self, input_file_path: Optional[str], data: Any, options: Optional[dict]
     ) -> None:
         """
         Reload the data class with a new dataset.
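Beyond the dict/list changes, the notable import change here is Generator moving from typing to collections.abc, which is its non-deprecated home on Python 3.9+. A stand-alone sketch of the batching pattern that get_batch_generator's signature describes, assuming a pandas DataFrame; the function name and data are illustrative, not the library's API:

from collections.abc import Generator  # same import style as the updated module
from typing import Union

import numpy as np
import pandas as pd

def batch_generator(
    data: pd.DataFrame, batch_size: int
) -> Generator[Union[pd.DataFrame, list], None, None]:
    """Yield shuffled batches of rows, mirroring the signature above."""
    indices = np.random.permutation(len(data))  # one random permutation per pass
    for i in range(0, len(data), batch_size):
        yield data.iloc[indices[i : i + batch_size]]

df = pd.DataFrame({"x": range(10)})
for batch in batch_generator(df, batch_size=4):
    print(len(batch))  # 4, 4, 2 (row order is random)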
