diff --git a/bugbug/comment_features.py b/bugbug/comment_features.py
new file mode 100644
index 0000000000..814b9109c5
--- /dev/null
+++ b/bugbug/comment_features.py
@@ -0,0 +1,229 @@
+# -*- coding: utf-8 -*-
+# This Source Code Form is subject to the terms of the Mozilla Public
+# License, v. 2.0. If a copy of the MPL was not distributed with this file,
+# You can obtain one at http://mozilla.org/MPL/2.0/.
+
+import sys
+from datetime import datetime
+from typing import Any
+
+import pandas as pd
+from sklearn.base import BaseEstimator, TransformerMixin
+
+from bugbug.utils import extract_urls_and_domains
+
+
+class CommentFeature(object):
+    pass
+
+
+class CommentExtractor(BaseEstimator, TransformerMixin):
+    def __init__(
+        self,
+        feature_extractors,
+        cleanup_functions,
+    ):
+        assert len(set(type(fe) for fe in feature_extractors)) == len(
+            feature_extractors
+        ), "Duplicate Feature Extractors"
+        self.feature_extractors = feature_extractors
+
+        assert len(set(type(cf) for cf in cleanup_functions)) == len(
+            cleanup_functions
+        ), "Duplicate Cleanup Functions"
+        self.cleanup_functions = cleanup_functions
+
+    def fit(self, x, y=None):
+        for feature in self.feature_extractors:
+            if hasattr(feature, "fit"):
+                feature.fit(x())
+
+        return self
+
+    def transform(self, items):
+        items_iter = iter(items())
+
+        def apply_transform(item):
+            _, comment = item
+            data = {}
+
+            for feature_extractor in self.feature_extractors:
+                res = feature_extractor(item)
+
+                if hasattr(feature_extractor, "name"):
+                    feature_extractor_name = feature_extractor.name
+                else:
+                    feature_extractor_name = feature_extractor.__class__.__name__
+
+                if res is None:
+                    continue
+
+                if isinstance(res, dict):
+                    for key, value in res.items():
+                        data[sys.intern(key)] = value
+                    continue
+
+                if isinstance(res, (list, set)):
+                    # Don't rebind `item` here: the (bug, comment) tuple is
+                    # still needed by the remaining feature extractors.
+                    for element in res:
+                        data[sys.intern(f"{element} in {feature_extractor_name}")] = True
+                    continue
+
+                data[feature_extractor_name] = res
+
+            comment_text = comment["text"]
+            for cleanup_function in self.cleanup_functions:
+                comment_text = cleanup_function(comment_text)
+
+            return {
+                "data": data,
+                "comment_text": comment_text,
+            }
+
+        return pd.DataFrame(apply_transform(item) for item in items_iter)
+
+
+class CommentCreatorIsBugCreator(CommentFeature):
+    name = "Comment Creator is the Bug Creator"
+
+    def __call__(self, item, **kwargs) -> Any:
+        bug, comment = item
+
+        return bug["creator"] == comment["creator"]
+
+
+class NumberOfLinks(CommentFeature):
+    name = "Number of Links in the comment"
+
+    def __init__(self, domains_to_ignore=set()):
+        self.known_domains = domains_to_ignore
+
+    def __call__(self, item, **kwargs) -> Any:
+        _, comment = item
+
+        domains = extract_urls_and_domains(comment["text"])["domains"]
+
+        return {
+            "# of Known links": sum(domain in self.known_domains for domain in domains),
+            "# of Unknown links": sum(
+                domain not in self.known_domains for domain in domains
+            ),
+            "Total # of links": len(domains),
+        }
+
+
+class CharacterCount(CommentFeature):
+    name = "# of Characters in the Comment"
+
+    def __call__(self, item, **kwargs):
+        _, comment = item
+
+        return len(comment["text"])
+
+
+class WordCount(CommentFeature):
+    name = "# of Words in the Comment"
+
+    def __call__(self, item, **kwargs):
+        _, comment = item
+
+        return len(comment["text"].split())
+
+
+class UnknownLinkAtBeginning(CommentFeature):
+    name = "Unknown Link found at Beginning of the Comment"
+
+    def __init__(self, domains_to_ignore=set()):
+        self.known_domains = domains_to_ignore
+
+    def __call__(self, item, **kwargs):
+        _, comment = item
+
+        urls = extract_urls_and_domains(comment["text"], self.known_domains)["urls"]
+
+        words = comment["text"].split()
+        return words[0] in urls if words else False
+
+
+class UnknownLinkAtEnd(CommentFeature):
+    name = "Unknown Link found at End of the Comment"
+
+    def __init__(self, domains_to_ignore=set()):
+        self.known_domains = domains_to_ignore
+
+    def __call__(self, item, **kwargs):
+        _, comment = item
+
+        urls = extract_urls_and_domains(comment["text"], self.known_domains)["urls"]
+
+        words = comment["text"].split()
+        return words[-1] in urls if words else False
+
+
+class HourOfDay(CommentFeature):
+    name = "Hour of the Day (0-23)"
+
+    def __call__(self, item, **kwargs):
+        _, comment = item
+
+        comment_time = datetime.strptime(comment["creation_time"], "%Y-%m-%dT%H:%M:%SZ")
+        return comment_time.hour
+
+
+class Weekday(CommentFeature):
+    name = "Day of the Week (1-7)"
+
+    def __call__(self, item, **kwargs):
+        _, comment = item
+
+        comment_time = datetime.strptime(comment["creation_time"], "%Y-%m-%dT%H:%M:%SZ")
+        return comment_time.isoweekday()
+
+
+class PostedOnWeekend(CommentFeature):
+    name = "Comment was Posted on Weekend"
+
+    def __call__(self, item, **kwargs):
+        _, comment = item
+
+        comment_time = datetime.strptime(comment["creation_time"], "%Y-%m-%dT%H:%M:%SZ")
+        # isoweekday(): Monday is 1 and Sunday is 7, so the weekend is (6, 7).
+        return comment_time.isoweekday() in (6, 7)
+
+
+class DayOfYear(CommentFeature):
+    name = "Day of the Year (1-366)"
+
+    def __call__(self, item, **kwargs):
+        _, comment = item
+
+        comment_time = datetime.strptime(comment["creation_time"], "%Y-%m-%dT%H:%M:%SZ")
+        return comment_time.timetuple().tm_yday
+
+
+class WeekOfYear(CommentFeature):
+    name = "Week of Year"
+
+    def __call__(self, item, **kwargs):
+        _, comment = item
+
+        comment_time = datetime.strptime(comment["creation_time"], "%Y-%m-%dT%H:%M:%SZ")
+        return comment_time.isocalendar()[1]
+
+
+class CommentTags(CommentFeature):
+    name = "Comment Tags"
+
+    def __init__(self, to_ignore=set()):
+        self.to_ignore = to_ignore
+
+    def __call__(self, item, **kwargs):
+        _, comment = item
+
+        return [tag for tag in comment["tags"] if tag not in self.to_ignore]
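A minimal sketch of how the extractor above can be driven. The toy bug/comment dicts and the particular extractor choices are illustrative, not part of the patch; only the fields the feature classes actually read (creator, text, creation_time, tags) are filled in:

```python
from bugbug import comment_features, feature_cleanup

bug = {"creator": "reporter@example.com"}
comment = {
    "creator": "reporter@example.com",
    "text": "See https://spam.example.net for a great offer",
    "creation_time": "2024-03-02T14:05:00Z",  # a Saturday
    "tags": [],
}

extractor = comment_features.CommentExtractor(
    feature_extractors=[
        comment_features.WordCount(),
        comment_features.PostedOnWeekend(),
        comment_features.CommentCreatorIsBugCreator(),
    ],
    cleanup_functions=[feature_cleanup.url()],
)

# fit() and transform() both expect a callable that returns the items, so the
# underlying generator can be re-created for each pass over the data.
def items():
    return [(bug, comment)]

df = extractor.fit(items).transform(items)
print(df.iloc[0]["data"])  # feature name -> value mapping for this comment
```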
diff --git a/bugbug/model.py b/bugbug/model.py
index 92de0c34f2..a0699979f6 100644
--- a/bugbug/model.py
+++ b/bugbug/model.py
@@ -174,6 +174,8 @@ def __init__(self, lemmatization=False):
 
         self.store_dataset = False
 
+        self.use_scale_pos_weight = False
+
         self.entire_dataset_training = False
 
         # DBs required for training.
@@ -216,6 +218,8 @@ def get_human_readable_feature_names(self):
                 feature_name = f"Comments contain '{feature_name}'"
             elif type_ == "text":
                 feature_name = f"Combined text contains '{feature_name}'"
+            elif type_ == "comment_text":
+                feature_name = f"Comment text contains '{feature_name}'"
             elif type_ == "files":
                 feature_name = f"File '{feature_name}'"
             elif type_ not in ("data", "couple_data"):
@@ -388,6 +392,21 @@ def train(self, importance_cutoff=0.15, limit=None):
 
         # Split dataset in training and test.
         X_train, X_test, y_train, y_test = self.train_test_split(X, y)
 
+        # Use scale_pos_weight to help in extremely imbalanced datasets.
+        if self.use_scale_pos_weight and is_binary:
+            negative_samples = sum(label == 0 for label in y_train)
+            positive_samples = sum(label == 1 for label in y_train)
+            logger.info("Negative Samples: %d", negative_samples)
+            logger.info("Positive Samples: %d", positive_samples)
+
+            scale_pos_weight = (negative_samples / positive_samples) / 10
+
+            logger.info("Scale Pos Weight: %f", scale_pos_weight)
+
+            self.clf.named_steps["estimator"].set_params(
+                scale_pos_weight=scale_pos_weight
+            )
+
         tracking_metrics = {}
 
         # Use k-fold cross validation to evaluate results.
@@ -803,3 +822,18 @@ def items_gen(self, classes):
                 continue
 
             yield issue, classes[issue_number]
+
+
+class CommentModel(Model):
+    def __init__(self, lemmatization=False):
+        Model.__init__(self, lemmatization)
+        self.training_dbs = [bugzilla.BUGS_DB]
+
+    def items_gen(self, classes):
+        for bug in bugzilla.get_bugs():
+            for comment in bug["comments"]:
+                comment_id = comment["id"]
+                if comment_id not in classes:
+                    continue
+
+                yield (bug, comment), classes[comment_id]
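For intuition, here is the `scale_pos_weight` computation from `train()` applied to made-up label counts (990 negatives vs. 10 positives; the numbers are only illustrative):

```python
import xgboost

y_train = [0] * 990 + [1] * 10
negative_samples = sum(label == 0 for label in y_train)
positive_samples = sum(label == 1 for label in y_train)

# The patch uses a tenth of the raw negative/positive ratio, rather than the
# full ratio that the XGBoost docs suggest as a starting point.
scale_pos_weight = (negative_samples / positive_samples) / 10  # (990 / 10) / 10 == 9.9

clf = xgboost.XGBClassifier(scale_pos_weight=scale_pos_weight)
```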
diff --git a/bugbug/models/__init__.py b/bugbug/models/__init__.py
index 1441e59bea..cc48fcca64 100644
--- a/bugbug/models/__init__.py
+++ b/bugbug/models/__init__.py
@@ -29,6 +29,7 @@
     "regressionrange": "bugbug.models.regressionrange.RegressionRangeModel",
     "regressor": "bugbug.models.regressor.RegressorModel",
     "spambug": "bugbug.models.spambug.SpamBugModel",
+    "spamcomment": "bugbug.models.spamcomment.SpamCommentModel",
     "stepstoreproduce": "bugbug.models.stepstoreproduce.StepsToReproduceModel",
     "testlabelselect": "bugbug.models.testselect.TestLabelSelectModel",
     "testgroupselect": "bugbug.models.testselect.TestGroupSelectModel",
diff --git a/bugbug/models/spamcomment.py b/bugbug/models/spamcomment.py
new file mode 100644
index 0000000000..6d24a07ed9
--- /dev/null
+++ b/bugbug/models/spamcomment.py
@@ -0,0 +1,178 @@
+# -*- coding: utf-8 -*-
+# This Source Code Form is subject to the terms of the Mozilla Public
+# License, v. 2.0. If a copy of the MPL was not distributed with this file,
+# You can obtain one at http://mozilla.org/MPL/2.0/.
+
+import logging
+
+import xgboost
+from imblearn.over_sampling import BorderlineSMOTE
+from imblearn.pipeline import Pipeline as ImblearnPipeline
+from sklearn.compose import ColumnTransformer
+from sklearn.feature_extraction import DictVectorizer
+from sklearn.pipeline import Pipeline
+
+from bugbug import bugzilla, comment_features, feature_cleanup, repository, utils
+from bugbug.model import CommentModel
+
+logging.basicConfig(level=logging.INFO)
+logger = logging.getLogger(__name__)
+
+SAFE_DOMAINS = {"github.com", "mozilla.com", "mozilla.org"}
+
+
+class SpamCommentModel(CommentModel):
+    def __init__(self, lemmatization=True):
+        CommentModel.__init__(self, lemmatization)
+
+        self.calculate_importance = False
+
+        self.use_scale_pos_weight = True
+
+        self.commit_emails = {
+            commit["author_email"]
+            for commit in repository.get_commits(include_backouts=True)
+        }
+
+        feature_extractors = [
+            comment_features.NumberOfLinks(SAFE_DOMAINS),
+            comment_features.WordCount(),
+            comment_features.HourOfDay(),
+            comment_features.DayOfYear(),
+            comment_features.Weekday(),
+            comment_features.UnknownLinkAtBeginning(SAFE_DOMAINS),
+            comment_features.UnknownLinkAtEnd(SAFE_DOMAINS),
+            comment_features.CommentCreatorIsBugCreator(),
+        ]
+
+        cleanup_functions = [
+            feature_cleanup.fileref(),
+            feature_cleanup.url(),
+            feature_cleanup.synonyms(),
+        ]
+
+        self.extraction_pipeline = Pipeline(
+            [
+                (
+                    "comment_extractor",
+                    comment_features.CommentExtractor(
+                        feature_extractors, cleanup_functions
+                    ),
+                ),
+            ]
+        )
+
+        self.clf = ImblearnPipeline(
+            [
+                (
+                    "union",
+                    ColumnTransformer(
+                        [
+                            ("data", DictVectorizer(), "data"),
+                            (
+                                "comment_text",
+                                self.text_vectorizer(min_df=0.0001),
+                                "comment_text",
+                            ),
+                        ]
+                    ),
+                ),
+                (
+                    "sampler",
+                    BorderlineSMOTE(random_state=0),
+                ),
+                (
+                    "estimator",
+                    xgboost.XGBClassifier(n_jobs=utils.get_physical_cpu_count()),
+                ),
+            ]
+        )
+    @staticmethod
+    def __download_older_bugs_with_spam_comments() -> None:
+        """Retrieve older bugs that have comments tagged as spam.
+
+        This function provides an option to extend the dataset used for model
+        training by including older spam comments.
+        """
+        params = {
+            "f1": "comment_tag",
+            "o1": "substring",
+            "v1": "spam",
+            "product": bugzilla.PRODUCTS,
+        }
+
+        logger.info("Downloading older bugs...")
+        bugs_ids = bugzilla.get_ids(params)
+        older_bugs = bugzilla.download_bugs(bugs_ids)
+
+        logger.info("%d older bugs have been downloaded.", len(older_bugs))
+
+    def is_safe_comment(self, comment) -> bool:
+        """Determine whether a comment is certainly safe (not spam).
+
+        This function applies filtering rules to identify comments that are
+        likely authored by legitimate contributors or bots. Such comments are
+        definitely not spam.
+        """
+        return any(
+            [
+                comment["creator"] in self.commit_emails,
+                "@mozilla" in comment["creator"],
+                "@softvision" in comment["creator"],
+            ]
+        )
+
+    def get_labels(self):
+        classes = {}
+
+        self.__download_older_bugs_with_spam_comments()
+
+        for bug in bugzilla.get_bugs():
+            for comment in bug["comments"]:
+                comment_id = comment["id"]
+
+                if any(
+                    [
+                        # Comment 0 is the bug's description, not a real comment.
+                        comment["count"] == 0,
+                        self.is_safe_comment(comment),
+                        "[redacted -" in comment["text"],
+                        "(comment removed)" in comment["text"],
+                    ]
+                ):
+                    continue
+
+                if "spam" in comment["tags"]:
+                    classes[comment_id] = 1
+                else:
+                    classes[comment_id] = 0
+
+        logger.info(
+            "%d comments are classified as non-spam",
+            sum(label == 0 for label in classes.values()),
+        )
+        logger.info(
+            "%d comments are classified as spam",
+            sum(label == 1 for label in classes.values()),
+        )
+
+        return classes, [0, 1]
+
+    def items_gen(self, classes):
+        return (
+            ((bug, comment), classes[comment["id"]])
+            for bug in bugzilla.get_bugs()
+            for comment in bug["comments"]
+            if comment["id"] in classes
+        )
+
+    def get_feature_names(self):
+        return self.clf.named_steps["union"].get_feature_names_out()
+
+    def overwrite_classes(self, comments, classes, probabilities):
+        for i, comment in enumerate(comments):
+            if self.is_safe_comment(comment):
+                if probabilities:
+                    classes[i] = [1.0, 0.0]
+                else:
+                    classes[i] = 0
+
+        return classes
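One reason the classifier is wrapped in an imblearn pipeline rather than a plain sklearn one: the resampling step only runs during fit and is bypassed at prediction time. A standalone sketch on synthetic data (none of these names come from the patch):

```python
import xgboost
from imblearn.over_sampling import BorderlineSMOTE
from imblearn.pipeline import Pipeline as ImblearnPipeline
from sklearn.datasets import make_classification

# Synthetic, heavily imbalanced data (~5% positives).
X, y = make_classification(n_samples=500, weights=[0.95], random_state=0)

pipeline = ImblearnPipeline(
    [
        ("sampler", BorderlineSMOTE(random_state=0)),
        ("estimator", xgboost.XGBClassifier(n_jobs=1)),
    ]
)

pipeline.fit(X, y)       # the sampler synthesizes minority examples here
pipeline.predict(X[:5])  # the sampler is skipped at inference time
```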
+ """ + return any( + [ + comment["creator"] in self.commit_emails, + "@mozilla" in comment["creator"], + "@softvision" in comment["creator"], + ] + ) + + def get_labels(self): + classes = {} + + self.__download_older_bugs_with_spam_comments() + + for bug in bugzilla.get_bugs(): + for comment in bug["comments"]: + comment_id = comment["id"] + + if any( + [ + comment["count"] == "0", + self.is_safe_comment(comment), + "[redacted -" in comment["text"], + "(comment removed)" in comment["text"], + ] + ): + continue + + if "spam" in comment["tags"]: + classes[comment_id] = 1 + else: + classes[comment_id] = 0 + + logger.info( + "%d comments are classified as non-spam", + sum(label == 0 for label in classes.values()), + ) + logger.info( + "%d comments are classified as spam", + sum(label == 1 for label in classes.values()), + ) + + return classes, [0, 1] + + def items_gen(self, classes): + return ( + ((bug, comment), classes[comment["id"]]) + for bug in bugzilla.get_bugs() + for comment in bug["comments"] + if comment["id"] in classes + ) + + def get_feature_names(self): + return self.clf.named_steps["union"].get_feature_names_out() + + def overwrite_classes(self, comments, classes, probabilities): + for i, comment in enumerate(comments): + if self.is_safe_comment(comment): + if probabilities: + classes[i] = [1.0, 0.0] + else: + classes[i] = 0 + + return classes diff --git a/bugbug/utils.py b/bugbug/utils.py index d04778bd4f..e7bba56f19 100644 --- a/bugbug/utils.py +++ b/bugbug/utils.py @@ -27,6 +27,7 @@ import requests import scipy import taskcluster +import tldextract import zstandard from pkg_resources import DistributionNotFound from requests.packages.urllib3.util.retry import Retry @@ -558,3 +559,32 @@ def escape_markdown(text: str) -> str: def keep_as_is(x): """A tokenizer that does nothing.""" return x + + +def extract_urls_and_domains(text: str, domains_to_ignore: set = set()) -> dict: + """Extracts URLs and domains from a given text, optionally filtering out ignored domains. + + Args: + - text: The input text string where URLs and domains need to be found. + - domains_to_ignore: A set of domain names to exclude from the results. e.g. mozilla.com + + Returns: + A dictionary containing: + - "urls": A list of extracted URLs. + - "domains": A list of extracted domain names (excluding ignored domains if provided). 
diff --git a/requirements.txt b/requirements.txt
index 26ad78dc73..a8cdf8968d 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -28,6 +28,7 @@ shap[plots]==0.44.1
 tabulate==0.9.0
 taskcluster==60.4.2
 tenacity==8.2.3
+tldextract==5.1.1
 tqdm==4.66.2
 xgboost==2.0.3
 zstandard==0.22.0
diff --git a/tests/test_utils.py b/tests/test_utils.py
index fb1e700eab..6b93f62293 100644
--- a/tests/test_utils.py
+++ b/tests/test_utils.py
@@ -466,3 +466,111 @@ def test_StructuredColumnTransformer() -> None:
         .view(np.dtype("int64")),
         ColumnTransformer(transformers).fit_transform(df),
     )
+
+
+@pytest.mark.parametrize(
+    "test_input, expected_urls, expected_domains",
+    [
+        ("This is a sample text without any links.", [], []),
+        (
+            "Visit https://www.testdomain.com for more info.",
+            ["https://www.testdomain.com"],
+            ["testdomain.com"],
+        ),
+        (
+            "Links: http://www.example.com but ignore https://www.mozilla.com",
+            ["http://www.example.com"],
+            ["example.com"],
+        ),
+        (
+            "Check out https://example.org ,sign up on www.anothersite.net and proceed to https://firefox.mozilla.org",
+            ["https://example.org", "www.anothersite.net"],
+            ["example.org", "anothersite.net"],
+        ),
+        (
+            "Visit https://www.example.org.uk ,sign up on www.anothersite.net.ac and proceed to www.test.mozilla.org",
+            ["https://www.example.org.uk", "www.anothersite.net.ac"],
+            ["example.org.uk", "anothersite.net.ac"],
+        ),
+        (
+            "Check out http://example.com/a/abc/cat.jpg ,sign up on www.anothersite.net/abc/cde and proceed to https://firefox.mozilla.com/download/macos",
+            ["http://example.com/a/abc/cat.jpg", "www.anothersite.net/abc/cde"],
+            ["example.com", "anothersite.net"],
+        ),
+        (
+            "Visit https://www.example.org.uk/a/abc/cat.jpg ,sign up on www.anothersite.net.ac/abc/cde and visit https://www.mozilla.com/signup",
+            [
+                "https://www.example.org.uk/a/abc/cat.jpg",
+                "www.anothersite.net.ac/abc/cde",
+            ],
+            ["example.org.uk", "anothersite.net.ac"],
+        ),
+    ],
+)
+def test_url_extraction_ignore_domains(test_input, expected_urls, expected_domains):
+    """Tests extraction of URLs and domains while ignoring some domains."""
+    domains_to_ignore = {"mozilla.com", "mozilla.org"}
+    result = utils.extract_urls_and_domains(test_input, domains_to_ignore)
+
+    assert result["urls"] == expected_urls
+    assert result["domains"] == expected_domains
+
+
+@pytest.mark.parametrize(
+    "test_input, expected_urls, expected_domains",
+    [
+        ("This is a sample text without any links.", [], []),
+        (
+            "Visit https://www.testdomain.com for more info.",
+            ["https://www.testdomain.com"],
+            ["testdomain.com"],
+        ),
+        (
+            "Links: http://www.example.com , but do not ignore https://www.mozilla.com",
+            ["http://www.example.com", "https://www.mozilla.com"],
+            ["example.com", "mozilla.com"],
+        ),
+        (
+            "Check out https://example.org ,sign up on www.anothersite.net and proceed to https://firefox.mozilla.org",
+            [
+                "https://example.org",
+                "www.anothersite.net",
+                "https://firefox.mozilla.org",
+            ],
+            ["example.org", "anothersite.net", "mozilla.org"],
+        ),
+        (
+            "Visit https://www.example.org.uk ,sign up on www.anothersite.net.ac and proceed to www.test.mozilla.org",
+            [
+                "https://www.example.org.uk",
+                "www.anothersite.net.ac",
+                "www.test.mozilla.org",
+            ],
+            ["example.org.uk", "anothersite.net.ac", "mozilla.org"],
+        ),
+        (
+            "Check out http://example.com/a/abc/cat.jpg ,sign up on www.anothersite.net/abc/cde and proceed to https://firefox.mozilla.com/download/macos",
+            [
+                "http://example.com/a/abc/cat.jpg",
+                "www.anothersite.net/abc/cde",
+                "https://firefox.mozilla.com/download/macos",
+            ],
+            ["example.com", "anothersite.net", "mozilla.com"],
+        ),
+        (
+            "Visit http://www.example.org.uk/a/abc/cat.jpg ,sign up on www.anothersite.net.ac/abc/cde and visit https://www.mozilla.com/signup",
+            [
+                "http://www.example.org.uk/a/abc/cat.jpg",
+                "www.anothersite.net.ac/abc/cde",
+                "https://www.mozilla.com/signup",
+            ],
+            ["example.org.uk", "anothersite.net.ac", "mozilla.com"],
+        ),
+    ],
+)
+def test_url_extraction(test_input, expected_urls, expected_domains):
+    """Tests extraction of URLs and domains without ignoring domains."""
+    result = utils.extract_urls_and_domains(test_input)
+
+    assert result["urls"] == expected_urls
+    assert result["domains"] == expected_domains
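A note on the expectations above: the tests expect "example.org.uk" rather than "org.uk" because tldextract consults the Public Suffix List, so multi-part suffixes are recognized. For instance:

```python
import tldextract

ext = tldextract.extract("https://www.example.org.uk/a/abc/cat.jpg")
print(ext.subdomain, ext.domain, ext.suffix)  # www example org.uk
print(ext.registered_domain)                  # example.org.uk
```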