Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
13 changes: 13 additions & 0 deletions scripts/1-fetch/wikipedia_fetch.py
Original file line number Diff line number Diff line change
Expand Up @@ -63,6 +63,18 @@ def parse_arguments():
return args


def check_for_completion():
    """
    Abort the fetch early if data collection already appears complete.

    Counts the data rows in FILE_LANGUAGES and raises
    shared.QuantifyingException with exit code 0 once more than 300 rows
    are present, signalling that the fetch for QUARTER is already done.

    Raises:
        shared.QuantifyingException: (exit code 0) when the CSV already
            contains more than 300 rows.
    """
    try:
        with open(FILE_LANGUAGES, "r", newline="") as file_obj:
            reader = csv.DictReader(file_obj, dialect="unix")
            # Count rows lazily instead of materializing the whole file
            # in memory with list(reader) — only the count is needed.
            # NOTE(review): 300 is presumably the expected number of
            # Wikipedia languages for a complete fetch — confirm.
            if sum(1 for _ in reader) > 300:
                raise shared.QuantifyingException(
                    f"Data fetch completed for {QUARTER}", 0
                )
    except FileNotFoundError:
        pass  # File may not be found without --enable-save, etc.


def write_data(args, tool_data):
if not args.enable_save:
return args
Expand Down Expand Up @@ -157,6 +169,7 @@ def query_wikipedia_languages(session):
def main():
args = parse_arguments()
shared.paths_log(LOGGER, PATHS)
check_for_completion()
shared.git_fetch_and_merge(args, PATHS["repo"])
session = shared.get_session()
tool_data = query_wikipedia_languages(session)
Expand Down
53 changes: 9 additions & 44 deletions scripts/2-process/github_process.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,7 +11,6 @@
import traceback

# Third-party
# import pandas as pd
import pandas as pd

# Add parent directory so shared can be imported
Expand Down Expand Up @@ -60,6 +59,13 @@ def parse_arguments():
return args


def check_for_data_file(file_path):
    """
    Guard against regenerating processed data.

    Raises:
        shared.QuantifyingException: (exit code 0) when file_path
            already exists, meaning processing for QUARTER is done.
    """
    # Nothing on disk yet: safe to proceed with processing.
    if not os.path.exists(file_path):
        return
    raise shared.QuantifyingException(
        f"Processed data already exists for {QUARTER}", 0
    )


def data_to_csv(args, data, file_path):
if not args.enable_save:
return
Expand Down Expand Up @@ -92,6 +98,7 @@ def process_totals_by_license(args, count_data):
file_path = shared.path_join(
PATHS["data_phase"], "github_totals_by_license.csv"
)
check_for_data_file(file_path)
data_to_csv(args, data, file_path)


Expand Down Expand Up @@ -126,52 +133,10 @@ def process_totals_by_restriction(args, count_data):
file_path = shared.path_join(
PATHS["data_phase"], "github_totals_by_restriction.csv"
)
check_for_data_file(file_path)
data_to_csv(args, data, file_path)


# def load_quarter_data(quarter):
# """
# Load data for a specific quarter.
# """
# file_path = os.path.join(PATHS["data"], f"{quarter}",
# "1-fetch", "github_fetched")
# if not os.path.exists(file_path):
# LOGGER.error(f"Data file for quarter {quarter} not found.")
# return None
# return pd.read_csv(file_path)


# def compare_data(current_quarter, previous_quarter):
# """
# Compare data between two quarters.
# """
# current_data = load_quarter_data(current_quarter)
# previous_data = load_quarter_data(previous_quarter)

# if current_data is None or previous_data is None:
# return

# Process data to compare totals


# def parse_arguments():
# """
# Parses command-line arguments, returns parsed arguments.
# """
# LOGGER.info("Parsing command-line arguments")
# parser = argparse.ArgumentParser(
# description="Google Custom Search Comparison Report")
# parser.add_argument(
# "--current_quarter", type=str, required=True,
# help="Current quarter for comparison (e.g., 2024Q3)"
# )
# parser.add_argument(
# "--previous_quarter", type=str, required=True,
# help="Previous quarter for comparison (e.g., 2024Q2)"
# )
# return parser.parse_args()


def main():
args = parse_arguments()
shared.paths_log(LOGGER, PATHS)
Expand Down
11 changes: 10 additions & 1 deletion scripts/2-process/wikipedia_process.py
Original file line number Diff line number Diff line change
Expand Up @@ -63,6 +63,13 @@ def parse_arguments():
return args


def check_for_data_file(file_path):
    """
    Guard against regenerating processed data.

    Raises:
        shared.QuantifyingException: (exit code 0) when file_path
            already exists, meaning processing for QUARTER is done.
    """
    if os.path.exists(file_path):
        raise shared.QuantifyingException(
            f"Processed data already exists for {QUARTER}", 0
        )


def data_to_csv(args, data, file_path):
if not args.enable_save:
return
Expand Down Expand Up @@ -91,6 +98,7 @@ def process_highest_language_usage(args, count_data):
file_path = shared.path_join(
PATHS["data_phase"], "wikipedia_highest_language_usage.csv"
)
check_for_data_file(file_path)
data_to_csv(args, top_10, file_path)


Expand All @@ -114,6 +122,7 @@ def process_least_language_usage(args, count_data):
file_path = shared.path_join(
PATHS["data_phase"], "wikipedia_least_language_usage.csv"
)
check_for_data_file(file_path)
data_to_csv(args, bottom_10, file_path)


Expand All @@ -140,14 +149,14 @@ def process_language_representation(args, count_data):
file_path = shared.path_join(
PATHS["data_phase"], "wikipedia_language_representation.csv"
)
check_for_data_file(file_path)
data_to_csv(args, language_counts, file_path)


def main():
args = parse_arguments()
shared.paths_log(LOGGER, PATHS)
shared.git_fetch_and_merge(args, PATHS["repo"])

file_count = shared.path_join(
PATHS["data_1-fetch"], "wikipedia_count_by_languages.csv"
)
Expand Down