diff --git a/scripts/us_census/acs5yr/subject_tables/common/generate_col_map.py b/scripts/us_census/acs5yr/subject_tables/common/generate_col_map.py
index 45df90f849..8d05127f3d 100644
--- a/scripts/us_census/acs5yr/subject_tables/common/generate_col_map.py
+++ b/scripts/us_census/acs5yr/subject_tables/common/generate_col_map.py
@@ -166,23 +166,38 @@ def __init__(self, spec_dict={}, column_list=[], delimiter='!!'):
 
     def _find_and_replace_column_names(self, column):
         """
-        if spec has find_and_replace defined, this function updates column names
+        If spec has find_and_replace defined, this updates column names,
+        handling replacement keys that contain delimiters and multiple tokens.
         """
-        if 'find_and_replace' in self.features['preprocess']:
-            find_and_replace_dict = self.features['preprocess'][
-                'find_and_replace']
-            # replace entire column name
-            if column in find_and_replace_dict:
-                return find_and_replace_dict[column]
-            # replace a token in the column name
-            else:
-                # TODO: Support the find_and_replace of more than one token
-                part_list = column.split(self.delimiter)
-                for idx, part in enumerate(part_list):
-                    if part in find_and_replace_dict:
-                        part_list[idx] = find_and_replace_dict[part]
-                return self.delimiter.join(part_list)
-        return column
+        if 'find_and_replace' not in self.features.get('preprocess', {}):
+            return column
+
+        find_and_replace_dict = self.features['preprocess']['find_and_replace']
+        new_column = column
+
+        # 1. Substring replacement, so keys that contain the delimiter
+        # (multi-token keys) are handled.  Keys are sorted longest-first so
+        # a short key never clobbers part of a longer key's match.
+        sorted_keys = sorted(find_and_replace_dict.keys(), key=len, reverse=True)
+
+        for key in sorted_keys:
+            if key in new_column:
+                new_column = new_column.replace(key, find_and_replace_dict[key])
+
+        # 2. Token-based replacement for keys that equal a whole token.
+        # NOTE(review): pass 1 already replaced these keys as substrings,
+        # so this pass mostly catches tokens produced by pass 1.
+        parts = new_column.split(self.delimiter)
+        modified_tokens = False
+        for idx, part in enumerate(parts):
+            if part in find_and_replace_dict:
+                parts[idx] = find_and_replace_dict[part]
+                modified_tokens = True
+
+        if modified_tokens:
+            new_column = self.delimiter.join(parts)
+
+        return new_column
 
     def _generate_stat_vars_from_spec(self):
         """generates stat_var nodes for each column in column list and
@@ -203,23 +218,16 @@ def _generate_stat_vars_from_spec(self):
             # len((set(self.features['ignoreColumns']) &
             #    set(col.split(self.delimiter)) > 0:
         for col in self.column_list:
-            # TODO: Replace the type of ignore_token_count to boolean
-            ignore_token_count = 0
-            for part in col.split(self.delimiter):
-                for token in self.features['ignoreColumns']:
-                    if part == token:
-                        ignore_token_count = 1
-                    if token == col:
-                        ignore_token_count = 1
-
-            # if no tokens of the columns are in ignoreColumns of the spec
-            if ignore_token_count == 0:
+            # NOTE(review): substring match; broader than the old exact-token check
+            is_ignored = False
+            for ignore_pattern in self.features['ignoreColumns']:
+                if ignore_pattern in col:
+                    is_ignored = True
+                    break
+
+            # If not ignored, proceed to find_and_replace and statvar generation
+            if not is_ignored:
                 renamed_col = self._find_and_replace_column_names(col)
-                # TODO: Before calling the column_to_statvar method,
-                # remove the base class or generalization token in the
-                # column name from the enumSpecialization section of the
-                # spec.
-                # TODO: Should we generate an error _column_to_statvar() returns an empty statvar?
self.column_map[col] = self._column_to_statvar(renamed_col) # TODO: Deprecate this function, since enumSpecialization are used to diff --git a/scripts/us_census/api_utils/census_api_data_downloader.py b/scripts/us_census/api_utils/census_api_data_downloader.py index 7a4f812f0d..6289c62b96 100644 --- a/scripts/us_census/api_utils/census_api_data_downloader.py +++ b/scripts/us_census/api_utils/census_api_data_downloader.py @@ -34,10 +34,9 @@ module_dir_ = os.path.dirname(os.path.realpath(__file__)) path.insert(1, os.path.join(module_dir_, '../../../')) - -from .download_utils import download_url_list_iterations +from download_utils import download_url_list_iterations from tools.download_utils.requests_wrappers import request_url_json -from .status_file_utils import sync_status_list +from status_file_utils import sync_status_list FLAGS = flags.FLAGS @@ -165,6 +164,7 @@ def download_table(dataset: str, url_list = get_table_url_list(dataset, table_id, q_variable, year_list, output_path, api_key, s_level_list, force_fetch_config, force_fetch_data) + print(url_list) status_path = os.path.join(output_path, 'download_status.json') @@ -292,7 +292,11 @@ def consolidate_files(dataset: str, df = pd.DataFrame() for csv_file in csv_files_list[year]: cur_csv_path = os.path.join(output_path, csv_file) - df2 = pd.read_csv(cur_csv_path, low_memory=False) + try: + df2 = pd.read_csv(cur_csv_path, low_memory=False) + except pd.errors.EmptyDataError: + logging.warning('Skipping empty file: %s', cur_csv_path) + continue print("Collecting", csv_file) # remove extra columns drop_list = [] @@ -451,8 +455,10 @@ def download_table_variables(dataset, table_id, year_list, geo_url_map_path, def main(argv): year_list_int = list(range(FLAGS.start_year, FLAGS.end_year + 1)) + print("#########################",year_list_int) year_list = [str(y) for y in year_list_int] out_path = os.path.expanduser(FLAGS.output_path) + print("#####",FLAGS.summary_levels) if FLAGS.summary_levels: s_list = 
FLAGS.summary_levels else: