-
Notifications
You must be signed in to change notification settings - Fork 218
269 fea validator - add validator code. #652
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
base: master
Are you sure you want to change the base?
Changes from all commits
25b35ef
e08bc37
411e900
644e641
9b5c887
8702367
1ce5345
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,46 @@ | ||
| from graphistry.validate.validate_graph import validate_graph | ||
| import graphistry | ||
| import pandas as pd | ||
|
|
||
|
|
||
| def test_validate_graph_good(): | ||
| g = graphistry.edges(pd.DataFrame({'s': ['a', 'b'], 'd': ['b', 'c']}), 's', 'd').nodes( | ||
| pd.DataFrame({'id': ['a', 'b', 'c'], 'name': ['A', 'B', 'C']}), node='id') | ||
| assert (validate_graph(g) is True) | ||
|
|
||
|
|
||
| def test_validate_graph_undefined_nodeid(): | ||
| g = graphistry.edges(pd.DataFrame({'s': ['a', 'b'], 'd': ['b', 'c']}), 's', 'd').nodes( | ||
| pd.DataFrame({'id': ['a', 'b', 'c'], 'name': ['A', 'B', 'C']})) | ||
| assert (validate_graph(g) is False) | ||
|
|
||
|
|
||
| def test_validate_graph_duplicate_nodeid(): | ||
| g = graphistry.edges(pd.DataFrame({'s': ['a', 'b'], 'd': ['b', 'c']}), 's', 'd').nodes( | ||
| pd.DataFrame({'id': ['a','a', 'b', 'c'], 'name': ['A','A2', 'B', 'C']}), node='id') | ||
| assert (validate_graph(g) is False) | ||
|
|
||
|
|
||
| def test_validate_graph_missing_nodes(): | ||
| g = graphistry.edges(pd.DataFrame({'s': ['a', 'b'], 'd': ['b', 'c']})) | ||
| assert (validate_graph(g) is False) | ||
|
|
||
|
|
||
| def test_validate_graph_nan_nodes(): | ||
| g = graphistry.edges(pd.DataFrame({'s': ['a', 'b'], 'd': ['b', 'c']}), 's', 'd').nodes( | ||
| pd.DataFrame({'id': [None, 'b', 'c'], 'name': ['A', 'B', 'C']}), node='id') | ||
| assert (validate_graph(g) is False) | ||
|
|
||
|
|
||
| def test_validate_graph_missing_src_node(): | ||
| # Only returns warning | ||
| g = graphistry.edges(pd.DataFrame({'s': ['a', 'b'], 'd': ['b', 'c']}), 's', 'd').nodes( | ||
| pd.DataFrame({'id': ['b', 'c'], 'name': ['B', 'C']}), node='id') | ||
| assert (validate_graph(g) is True) | ||
|
|
||
|
|
||
| def test_validate_graph_missing_dst_node(): | ||
| # Only returns warning | ||
| g = graphistry.edges(pd.DataFrame({'s': ['a', 'b'], 'd': ['b', 'c']}), 's', 'd').nodes( | ||
| pd.DataFrame({'id': ['a','b', ], 'name': ['A', 'B']}), node='id') | ||
| assert (validate_graph(g) is True) |
| Original file line number | Diff line number | Diff line change | ||
|---|---|---|---|---|
| @@ -0,0 +1,61 @@ | ||||
| def check_node_dataframe_exists(g, verbose=True): | ||||
| if g._nodes is None: | ||||
| if verbose: | ||||
| print("Warning: graph was created with only edges. Skipping Node ID check if Node IDs match edge IDs. Use g2 = g.materialize_nodes() to force node df creation. Exiting.") | ||||
| return False | ||||
| return True | ||||
|
|
||||
|
|
||||
| def check_node_id_defined(g, verbose=True): | ||||
| if g._node is None: | ||||
| if verbose: | ||||
| print("Invalid graph: Missing Node ID. Did you forget to specify the node ID in the .nodes() function? Exiting.") | ||||
| return False | ||||
| return True | ||||
|
|
||||
|
|
||||
| def check_nan_node_ids(g, verbose=True): | ||||
| if g._nodes[g._node].isnull().any(): | ||||
|
Contributor
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. compute & report a more verbose mode might return the df to help save time |
||||
| if verbose: | ||||
| print("Invalid graph: Contains NaN Node IDs.") | ||||
| return False | ||||
| return True | ||||
|
|
||||
|
|
||||
| def check_duplicate_node_ids(g, verbose=True): | ||||
| if g._nodes[g._node].duplicated().any(): | ||||
|
Contributor
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. |
||||
| if verbose: | ||||
| print("Invalid graph: Contains duplicate Node IDs.") | ||||
| return False | ||||
| return True | ||||
|
|
||||
|
|
||||
| def check_edge_sources_exist_in_nodes(g, verbose=True): | ||||
| if not g._edges[g._source].isin(g._nodes[g._node]).all(): | ||||
|
Contributor
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. |
||||
| if verbose: | ||||
| print("Warning: Contains source edge IDs that do not exist in the node DataFrame. This can cause unexpected results.") | ||||
| return True | ||||
|
|
||||
|
|
||||
| def check_edge_destinations_exist_in_nodes(g, verbose=True): | ||||
| if not g._edges[g._destination].isin(g._nodes[g._node]).all(): | ||||
|
Contributor
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. |
||||
| if verbose: | ||||
| print("Warning: Contains destination edge IDs that do not exist in the node DataFrame. This can cause unexpected results.") | ||||
| return True | ||||
|
|
||||
|
|
||||
| def validate_graph(g, verbose=True): | ||||
|
Contributor
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more.
Contributor
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. for validators used for debugging, instead of doing an immediate error, good to collect errors and report them all, with failure only at the end
Contributor
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. as a public function, should have a docstr matching other docstrs format
Contributor
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. recommend also checking that dtypes of src/dst match, as does node/src if exist type mismatch is more of a warning vs a fail, as we can support bipartite graphs, like imagine
Contributor
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. likewise, as a warn vs fail, if there is no overlap of src/dst edges, they may have a mismatch needing cleanup, imagine |
||||
| if not check_node_dataframe_exists(g, verbose): | ||||
|
Contributor
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. node existence should be more like:
|
||||
| return False | ||||
| if not check_node_id_defined(g, verbose): | ||||
| return False | ||||
| if not check_nan_node_ids(g, verbose): | ||||
| return False | ||||
| if not check_duplicate_node_ids(g, verbose): | ||||
| return False | ||||
| check_edge_sources_exist_in_nodes(g, verbose) # Warnings only | ||||
| check_edge_destinations_exist_in_nodes(g, verbose) # Warnings only | ||||
|
Contributor
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. why not returning
Contributor
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Good point, I guess I was considering only graph with edge IDs that could be valid. However, to your point, we should error if g._node and g._nodes exist and there are missing node IDs from g._edges src/dst, that should throw an error. |
||||
|
|
||||
| if verbose: | ||||
| print("Graph is valid.") | ||||
| return True | ||||
Uh oh!
There was an error while loading. Please reload this page.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
switch from
printstatements towarn+logger.infodepending on useremoves need for
verboseparam via Pythonic conventions