14 changes: 9 additions & 5 deletions scripts/2-process/gcs_process.py
@@ -311,7 +311,9 @@ def main():
 
     # Count data
     file1_count = shared.path_join(PATHS["data_1-fetch"], "gcs_1_count.csv")
-    count_data = pd.read_csv(file1_count, usecols=["TOOL_IDENTIFIER", "COUNT"])
+    count_data = shared.open_data_file(
+        LOGGER, file1_count, usecols=["TOOL_IDENTIFIER", "COUNT"]
+    )
     process_product_totals(args, count_data)
     process_latest_prior_retired_totals(args, count_data)
     process_totals_by_free_cultural(args, count_data)
@@ -321,17 +321,19 @@ def main():
     file2_language = shared.path_join(
         PATHS["data_1-fetch"], "gcs_2_count_by_language.csv"
     )
-    language_data = pd.read_csv(
-        file2_language, usecols=["TOOL_IDENTIFIER", "LANGUAGE", "COUNT"]
+    language_data = shared.open_data_file(
+        LOGGER,
+        file2_language,
+        usecols=["TOOL_IDENTIFIER", "LANGUAGE", "COUNT"],
     )
     process_totals_by_language(args, language_data)
 
     # Country data
     file3_country = shared.path_join(
         PATHS["data_1-fetch"], "gcs_3_count_by_country.csv"
     )
-    country_data = pd.read_csv(
-        file3_country, usecols=["TOOL_IDENTIFIER", "COUNTRY", "COUNT"]
+    country_data = shared.open_data_file(
+        LOGGER, file3_country, usecols=["TOOL_IDENTIFIER", "COUNTRY", "COUNT"]
     )
     process_totals_by_country(args, country_data)
 
4 changes: 3 additions & 1 deletion scripts/2-process/github_process.py
@@ -178,7 +178,9 @@ def main():
     shared.git_fetch_and_merge(args, PATHS["repo"])
 
     file_count = shared.path_join(PATHS["data_1-fetch"], "github_1_count.csv")
-    count_data = pd.read_csv(file_count, usecols=["TOOL_IDENTIFIER", "COUNT"])
+    count_data = shared.open_data_file(
+        LOGGER, file_count, usecols=["TOOL_IDENTIFIER", "COUNT"]
+    )
     process_totals_by_license(args, count_data)
     process_totals_by_restriction(args, count_data)
 
4 changes: 3 additions & 1 deletion scripts/2-process/wikipedia_process.py
@@ -151,7 +151,9 @@ def main():
     file_count = shared.path_join(
         PATHS["data_1-fetch"], "wikipedia_count_by_languages.csv"
     )
-    count_data = pd.read_csv(file_count, usecols=["LANGUAGE_NAME_EN", "COUNT"])
+    count_data = shared.open_data_file(
+        LOGGER, file_count, usecols=["LANGUAGE_NAME_EN", "COUNT"]
+    )
     process_language_representation(args, count_data)
     process_highest_language_usage(args, count_data)
     process_least_language_usage(args, count_data)
20 changes: 10 additions & 10 deletions scripts/3-report/gcs_report.py
@@ -11,7 +11,6 @@
 import traceback
 
 # Third-party
-import pandas as pd
 from pygments import highlight
 from pygments.formatters import TerminalFormatter
 from pygments.lexers import PythonTracebackLexer
@@ -80,7 +79,7 @@ def gcs_intro(args):
     )
     LOGGER.info(f"data file: {file_path.replace(PATHS['repo'], '.')}")
     name_label = "CC legal tool product"
-    data = pd.read_csv(file_path, index_col=name_label)
+    data = shared.open_data_file(LOGGER, file_path, index_col=name_label)
     total_count = f"{data['Count'].sum():,d}"
     shared.update_readme(
         args,
@@ -111,7 +110,8 @@ def plot_products(args):
     )
     LOGGER.info(f"data file: {file_path.replace(PATHS['repo'], '.')}")
     name_label = "CC legal tool product"
-    data = pd.read_csv(file_path, index_col=name_label)
+    data = shared.open_data_file(LOGGER, file_path, index_col=name_label)
+
     data = data[::-1]  # reverse order
 
     title = "Products totals and percentages"
@@ -156,7 +156,7 @@ def plot_tool_status(args):
     )
     LOGGER.info(f"data file: {file_path.replace(PATHS['repo'], '.')}")
     name_label = "CC legal tool"
-    data = pd.read_csv(file_path, index_col=name_label)
+    data = shared.open_data_file(LOGGER, file_path, index_col=name_label)
     data.sort_values(name_label, ascending=False, inplace=True)
 
     title = "CC legal tools status"
@@ -199,7 +199,7 @@ def plot_latest_tools(args):
     )
     LOGGER.info(f"data file: {file_path.replace(PATHS['repo'], '.')}")
     name_label = "CC legal tool"
-    data = pd.read_csv(file_path, index_col=name_label)
+    data = shared.open_data_file(LOGGER, file_path, index_col=name_label)
     data.sort_values(name_label, ascending=False, inplace=True)
 
     title = "Latest CC legal tools"
@@ -241,7 +241,7 @@ def plot_prior_tools(args):
     )
     LOGGER.info(f"data file: {file_path.replace(PATHS['repo'], '.')}")
     name_label = "CC legal tool"
-    data = pd.read_csv(file_path, index_col=name_label)
+    data = shared.open_data_file(LOGGER, file_path, index_col=name_label)
     data.sort_values(name_label, ascending=False, inplace=True)
 
     title = "Prior CC legal tools"
@@ -286,7 +286,7 @@ def plot_retired_tools(args):
     )
     LOGGER.info(f"data file: {file_path.replace(PATHS['repo'], '.')}")
     name_label = "CC legal tool"
-    data = pd.read_csv(file_path, index_col=name_label)
+    data = shared.open_data_file(LOGGER, file_path, index_col=name_label)
     data.sort_values(name_label, ascending=False, inplace=True)
 
     title = "Retired CC legal tools"
@@ -332,7 +332,7 @@ def plot_countries_highest_usage(args):
     LOGGER.info(f"data file: {file_path.replace(PATHS['repo'], '.')}")
     name_label = "Country"
     data_label = "Count"
-    data = pd.read_csv(file_path, index_col=name_label)
+    data = shared.open_data_file(LOGGER, file_path, index_col=name_label)
     total_count = f"{data['Count'].sum():,d}"
     data.sort_values(data_label, ascending=False, inplace=True)
     data = data[:10]  # limit to highest 10
@@ -385,7 +385,7 @@ def plot_languages_highest_usage(args):
     LOGGER.info(f"data file: {file_path.replace(PATHS['repo'], '.')}")
     name_label = "Language"
     data_label = "Count"
-    data = pd.read_csv(file_path, index_col=name_label)
+    data = shared.open_data_file(LOGGER, file_path, index_col=name_label)
     total_count = f"{data['Count'].sum():,d}"
     data.sort_values(data_label, ascending=False, inplace=True)
     data = data[:10]  # limit to highest 10
@@ -439,7 +439,7 @@ def plot_free_culture(args):
     LOGGER.info(f"data file: {file_path.replace(PATHS['repo'], '.')}")
     name_label = "Category"
     data_label = "Count"
-    data = pd.read_csv(file_path, index_col=name_label)
+    data = shared.open_data_file(LOGGER, file_path, index_col=name_label)
 
     title = "Approved for Free Cultural Works"
     plt = plot.combined_plot(
12 changes: 4 additions & 8 deletions scripts/3-report/github_report.py
@@ -11,7 +11,6 @@
 import traceback
 
 # Third-party
-import pandas as pd
 from pygments import highlight
 from pygments.formatters import TerminalFormatter
 from pygments.lexers import PythonTracebackLexer
@@ -77,11 +76,8 @@ def load_data(args):
         PATHS["data"], f"{selected_quarter}", "1-fetch", "github_1_count.csv"
     )
 
-    if not os.path.exists(file_path):
-        LOGGER.error(f"Data file not found: {file_path}")
-        return pd.DataFrame()
+    data = shared.open_data_file(LOGGER, file_path)
 
-    data = pd.read_csv(file_path)
     LOGGER.info(f"Data loaded from {file_path}")
     return data
 
@@ -97,7 +93,7 @@ def github_intro(args):
     )
     LOGGER.info(f"data file: {file_path.replace(PATHS['repo'], '.')}")
     name_label = "TOOL_IDENTIFIER"
-    data = pd.read_csv(file_path, index_col=name_label)
+    data = shared.open_data_file(LOGGER, file_path, index_col=name_label)
     total_repositories = data.loc["Total public repositories", "COUNT"]
     cc_total = data[data.index.str.startswith("CC")]["COUNT"].sum()
     cc_percentage = f"{(cc_total / total_repositories) * 100:.2f}%"
@@ -152,7 +148,7 @@ def plot_totals_by_license_type(args):
     LOGGER.info(f"data file: {file_path.replace(PATHS['repo'], '.')}")
     name_label = "License"
     data_label = "Count"
-    data = pd.read_csv(file_path, index_col=name_label)
+    data = shared.open_data_file(LOGGER, file_path, index_col=name_label)
     data.sort_values(data_label, ascending=True, inplace=True)
     title = "Totals by license type"
     plt = plot.combined_plot(
@@ -201,7 +197,7 @@ def plot_totals_by_restriction(args):
     LOGGER.info(f"data file: {file_path.replace(PATHS['repo'], '.')}")
     name_label = "Category"
     data_label = "Count"
-    data = pd.read_csv(file_path, index_col=name_label)
+    data = shared.open_data_file(LOGGER, file_path, index_col=name_label)
     data.sort_values(name_label, ascending=False, inplace=True)
     title = "Totals by restriction"
     plt = plot.combined_plot(
13 changes: 7 additions & 6 deletions scripts/3-report/wikipedia_report.py
@@ -11,7 +11,6 @@
 import traceback
 
 # Third-party
-import pandas as pd
 from pygments import highlight
 from pygments.formatters import TerminalFormatter
 from pygments.lexers import PythonTracebackLexer
@@ -87,9 +86,11 @@ def wikipedia_intro(args):
     )
     name_label = "LANGUAGE_NAME_EN"
     name_label_top10 = "Language"
-    data = pd.read_csv(file_path, index_col=name_label)
+    data = shared.open_data_file(LOGGER, file_path, index_col=name_label)
     total_articles = data["COUNT"].sum()
-    top10 = pd.read_csv(file_path_top10, index_col=name_label_top10)
+    top10 = shared.open_data_file(
+        LOGGER, file_path_top10, index_col=name_label_top10
+    )
     top10_articles = top10["Count"].sum()
     top10_percentage = (top10_articles / total_articles) * 100
     average_articles = total_articles / len(data)
@@ -131,7 +132,7 @@ def plot_language_representation(args):
     LOGGER.info(f"data file: {file_path.replace(PATHS['repo'], '.')}")
     name_label = "Category"
     data_label = "Count"
-    data = pd.read_csv(file_path, index_col=name_label)
+    data = shared.open_data_file(LOGGER, file_path, index_col=name_label)
     data.sort_values(data_label, ascending=True, inplace=True)
     title = "Language Representation"
     plt = plot.combined_plot(
@@ -176,7 +177,7 @@ def plot_highest_language_usage(args):
     LOGGER.info(f"data file: {file_path.replace(PATHS['repo'], '.')}")
     name_label = "Language"
     data_label = "Count"
-    data = pd.read_csv(file_path, index_col=name_label)
+    data = shared.open_data_file(LOGGER, file_path, index_col=name_label)
     data.sort_values(data_label, ascending=True, inplace=True)
     title = "Most represented languages"
     plt = plot.combined_plot(
@@ -219,7 +220,7 @@ def plot_least_language_usage(args):
     LOGGER.info(f"data file: {file_path.replace(PATHS['repo'], '.')}")
     name_label = "Language"
     data_label = "Count"
-    data = pd.read_csv(file_path, index_col=name_label)
+    data = shared.open_data_file(LOGGER, file_path, index_col=name_label)
     data.sort_values(data_label, ascending=True, inplace=True)
     title = "Least represented languages"
     plt = plot.combined_plot(
33 changes: 33 additions & 0 deletions scripts/shared.py
@@ -6,6 +6,7 @@
 from datetime import datetime, timezone
 
 # Third-party
+import pandas as pd
 from git import InvalidGitRepositoryError, NoSuchPathError, Repo
 from pandas import PeriodIndex
 from requests import Session
@@ -66,6 +67,38 @@ def get_session(accept_header=None, session=None):
     return session
 
 
+def open_data_file(
+    logger,
+    file_path,
+    usecols=None,
+    index_col=None,
+):
+    """
+    Open a CSV data file safely and convert expected errors into
+    QuantifyingException. This shared function ensures all process/report
+    scripts benefit from the same error handling.
+    """
+    try:
+        # Reading the file
+        return pd.read_csv(file_path, usecols=usecols, index_col=index_col)
+    # File does not exist
+    except FileNotFoundError:
+        raise QuantifyingException(
+            message=f"Data file not found: {file_path}", exit_code=1
+        )
+    # Empty or invalid CSV file
+    except pd.errors.EmptyDataError:
+        raise QuantifyingException(
+            message=f"CSV file is empty or invalid: {file_path}", exit_code=1
+        )
+    # Permission denied
+    except PermissionError:
+        raise QuantifyingException(
+            message=f"Permission denied when accessing data file: {file_path}",
+            exit_code=1,
+        )
+
+
 def git_fetch_and_merge(args, repo_path, branch=None):
     if not args.enable_git:
         return
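Reviewer note (illustrative only, not part of the diff): a minimal sketch of how a calling script might use the new shared.open_data_file() helper and handle the QuantifyingException it raises. Only open_data_file(), QuantifyingException, and the LOGGER argument come from the diff above; the module-level wiring, the example file path, and the assumption that the exception exposes message and exit_code attributes are hypothetical.

# Hypothetical caller; names other than shared.open_data_file() and
# shared.QuantifyingException are illustrative.
import logging
import sys

import shared  # scripts/shared.py as modified by this PR

LOGGER = logging.getLogger(__name__)


def main():
    # Raises shared.QuantifyingException (instead of silently returning an
    # empty DataFrame) when the CSV is missing, empty/invalid, or unreadable.
    data = shared.open_data_file(
        LOGGER, "data/example_count.csv", usecols=["TOOL_IDENTIFIER", "COUNT"]
    )
    LOGGER.info(f"Loaded {len(data)} rows")


if __name__ == "__main__":
    try:
        main()
    except shared.QuantifyingException as e:
        # Assumes the exception stores the message and exit_code keyword
        # arguments it is constructed with in the diff above.
        LOGGER.error(e.message)
        sys.exit(e.exit_code)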