Skip to content
Open
Show file tree
Hide file tree
Changes from 49 commits
Commits
Show all changes
67 commits
Select commit Hold shift + click to select a range
efef0bc
Create spamcomment model
jpangas Jan 12, 2024
bff58c5
Add New Features
jpangas Jan 15, 2024
48871dc
Merge remote-tracking branch 'upstream/master' into spamcom
jpangas Jan 16, 2024
61b0fe0
Include new features and change spamcom
jpangas Jan 18, 2024
e31fa75
Version 0.0.534
suhaibmujahid Jan 19, 2024
5103030
Merge remote-tracking branch 'upstream/master' into spamcom
jpangas Jan 19, 2024
a69cc54
Merge remote-tracking branch 'upstream/master' into spamcom
jpangas Jan 22, 2024
d365ad3
Create comments extractor
jpangas Jan 23, 2024
9ce864a
Remove comment features from Bug Features
jpangas Jan 23, 2024
77d534d
Add New features
jpangas Jan 24, 2024
73f74a4
Refine Link feature
jpangas Jan 25, 2024
2d65489
Test with TomekLinks
jpangas Jan 29, 2024
501a89f
Change df in text vectorizer
jpangas Jan 29, 2024
606f743
Use oversampling
jpangas Feb 2, 2024
41a73cb
Use max_step
jpangas Feb 6, 2024
586576d
Include and Refine features
jpangas Feb 7, 2024
ba7a1a1
Split Date Features
jpangas Feb 9, 2024
8f429d1
Rename features correctly
jpangas Feb 9, 2024
1ef2493
Remove Commenter Experience and Invalid Bugs
jpangas Feb 12, 2024
5a18517
Remove first comment
jpangas Feb 13, 2024
ea6c168
Include Links Dictionary
Feb 15, 2024
874b19f
Fix Error and Lint
jpangas Feb 15, 2024
b3da2e5
Refactor the Links Dictionary
jpangas Feb 15, 2024
b49485d
Use List instead
jpangas Feb 15, 2024
71fe950
Merge remote-tracking branch 'origin/master' into spamcom
jpangas Feb 16, 2024
4626064
Merge remote-tracking branch 'origin/spamcom' into spamcom
jpangas Feb 16, 2024
a7044b0
Use Dictionary for # of links
jpangas Feb 16, 2024
13772c7
Include older bugs
jpangas Feb 19, 2024
7cf0dcd
Replace Weekday with Weekend
jpangas Feb 19, 2024
cc8e6f6
Include max_delta_step
jpangas Feb 20, 2024
c4e4f22
Revert "Include max_delta_step"
jpangas Feb 20, 2024
01cca1e
Test using scale_pos_weight
jpangas Feb 20, 2024
cc42dee
Use URL Extract
jpangas Feb 20, 2024
4b8cf49
Revert to Using Regex
jpangas Feb 21, 2024
5c5da8c
Introduce new extraction func and features
jpangas Feb 22, 2024
dc16331
Include tests for extraction function
jpangas Feb 22, 2024
e5b0349
Change scale_pos_weight value
jpangas Feb 22, 2024
644795a
Change regex for extraction
jpangas Feb 22, 2024
45097da
Include tld_extract library
jpangas Feb 22, 2024
0a06ea3
Test without scale_pos_weight
jpangas Feb 22, 2024
e193764
Test with n_estimators changed
jpangas Feb 22, 2024
dda9b95
Test with GridSearch CV Values
jpangas Feb 23, 2024
5ba0c22
Remove scale_pos_weight from model.py
jpangas Feb 23, 2024
ca16b98
Set n_estimators to 1000
jpangas Feb 23, 2024
18d18f0
Revert "Remove scale_pos_weight from model.py"
jpangas Feb 23, 2024
1d35968
Remove comments which have 'redacted-
jpangas Feb 23, 2024
0a21b61
Test with new parameters
jpangas Feb 25, 2024
00a9f9f
Change df
jpangas Feb 25, 2024
f55d137
Test: Include tags as feature
jpangas Feb 26, 2024
dbcb311
Exclude comment tags
jpangas Feb 26, 2024
1b437da
Exclude emails from commit authors
jpangas Feb 27, 2024
16e14c5
Test without scale pos weight
jpangas Feb 27, 2024
94ab283
Test with scale_pos_weight adjusted
jpangas Feb 27, 2024
5a58108
Adjust scale pos weight
jpangas Feb 28, 2024
3eab988
Test without WeekOfYear
jpangas Mar 1, 2024
bd16d56
Include comment classifier
jpangas Mar 6, 2024
0a11f3c
Include script in setup
jpangas Mar 6, 2024
a3956b4
Fix script error
jpangas Mar 6, 2024
5c93d23
Fix setup error
jpangas Mar 6, 2024
15c8d5a
Classify all comments
jpangas Mar 7, 2024
5f953ac
Include spamcom in model names
jpangas Mar 13, 2024
df77a40
Merge remote-tracking branch 'upstream/master' into spamcom
jpangas Mar 13, 2024
4cd6c6d
Merge branch 'mozilla:master' into spamcom
jpangas Mar 13, 2024
4237f8f
Remove comment independent files
jpangas Mar 14, 2024
ba2ece2
Merge remote-tracking branch 'origin/spamcom' into spamcom
jpangas Mar 14, 2024
5490d01
Use(bug,comment) tuple
jpangas Mar 26, 2024
d95852d
Include BugvsCreator Feature
jpangas Apr 2, 2024
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
210 changes: 210 additions & 0 deletions bugbug/comment_features.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,210 @@
# -*- coding: utf-8 -*-
# This Source Code Form is subject to the terms of the Mozilla Public
# License, v. 2.0. If a copy of the MPL was not distributed with this file,
# You can obtain one at http://mozilla.org/MPL/2.0/.

import sys
from collections import defaultdict
from datetime import datetime
from typing import Any

import pandas as pd
from sklearn.base import BaseEstimator, TransformerMixin

from bugbug.utils import extract_urls_and_domains


class CommentFeature:
    """Marker base class shared by all comment feature extractors."""


class CommentExtractor(BaseEstimator, TransformerMixin):
    """Turn comments into rows of extracted features plus cleaned-up text.

    Args:
        feature_extractors: Callables invoked as
            ``extractor(comment, commenter_experience=...)``. At most one
            instance of each type may be supplied.
        cleanup_functions: Callables applied, in order, to the comment text.
            At most one instance of each type may be supplied.
    """

    def __init__(
        self,
        feature_extractors,
        cleanup_functions,
    ):
        extractor_types = {type(extractor) for extractor in feature_extractors}
        assert len(extractor_types) == len(
            feature_extractors
        ), "Duplicate Feature Extractors"
        self.feature_extractors = feature_extractors

        cleanup_types = {type(function) for function in cleanup_functions}
        assert len(cleanup_types) == len(
            cleanup_functions
        ), "Duplicate Cleanup Functions"
        self.cleanup_functions = cleanup_functions

    def fit(self, x, y=None):
        """Fit every feature extractor that exposes a ``fit`` method.

        ``x`` is a callable returning the comments; it is called again for
        each fittable extractor. Returns ``self``.
        """
        for extractor in self.feature_extractors:
            if not hasattr(extractor, "fit"):
                continue
            extractor.fit(x())

        return self

    def transform(self, comments):
        """Build a DataFrame with one row per comment.

        ``comments`` is a callable returning an iterable of comments. Each row
        has a ``data`` entry (dict of feature values) and a ``comment_text``
        entry (the comment text after all cleanup functions were applied).
        """
        # How many comments from each creator were already processed in this
        # call; this is what extractors receive as `commenter_experience`.
        seen_per_creator = defaultdict(int)

        def build_row(comment):
            creator = comment["creator"]
            experience = seen_per_creator[creator]
            features = {}

            for extractor in self.feature_extractors:
                result = extractor(comment, commenter_experience=experience)
                label = getattr(extractor, "name", extractor.__class__.__name__)

                if result is None:
                    continue

                if isinstance(result, dict):
                    # A dict contributes its own (interned) keys directly.
                    features.update(
                        (sys.intern(key), value) for key, value in result.items()
                    )
                elif isinstance(result, (list, set)):
                    # A collection becomes one boolean flag per item.
                    features.update(
                        (sys.intern(f"{item} in {label}"), True) for item in result
                    )
                else:
                    features[label] = result

            # Only count this comment after its own features were computed.
            seen_per_creator[creator] += 1

            text = comment["text"]
            for cleanup in self.cleanup_functions:
                text = cleanup(text)

            return {
                "data": features,
                "comment_text": text,
            }

        rows = (build_row(comment) for comment in comments())
        return pd.DataFrame(rows)


class CommenterExperience(CommentFeature):
    """Expose the caller-supplied commenter experience count as a feature."""

    name = "# of Comments made by Commenter in the past"

    def __call__(self, comment, commenter_experience, **kwargs):
        """Return ``commenter_experience`` unchanged; ``comment`` is not read."""
        return commenter_experience


class NumberOfLinks(CommentFeature):
    """Count the link domains in a comment, split into known and unknown.

    Args:
        domains_to_ignore: Collection of domains treated as "known".
            Defaults to an empty set.
    """

    name = "Number of Links in the comment"

    def __init__(self, domains_to_ignore=None):
        # Create the empty default per instance: a `set()` default argument is
        # a single object shared by every instance built without an argument.
        self.known_domains = set() if domains_to_ignore is None else domains_to_ignore

    def __call__(self, comment, **kwargs) -> Any:
        """Return a dict with the known, unknown and total link counts."""
        domains = extract_urls_and_domains(comment["text"])["domains"]

        total = len(domains)
        known = sum(domain in self.known_domains for domain in domains)

        return {
            "# of Known links": known,
            # Every domain is either known or unknown.
            "# of Unknown links": total - known,
            "Total # of links": total,
        }


class CharacterCount(CommentFeature):
    """Length of the comment text in characters."""

    name = "# of Characters in the Comment"

    def __call__(self, comment, **kwargs):
        """Return ``len`` of the comment's ``text`` field."""
        text = comment["text"]
        return len(text)


class WordCount(CommentFeature):
    """Number of whitespace-separated words in the comment text."""

    name = "# of Words in the Comment"

    def __call__(self, comment, **kwargs):
        """Return how many tokens ``str.split()`` finds in the comment text."""
        words = comment["text"].split()
        return len(words)


class UnknownLinkAtBeginning(CommentFeature):
    """Flag comments whose first word is an extracted URL.

    Args:
        domains_to_ignore: Collection of known domains, forwarded to
            ``extract_urls_and_domains`` (presumably so that URLs on those
            domains are left out; confirm against that helper). Defaults to an
            empty set.
    """

    name = "Unknown Link found at Beginning of the Comment"

    def __init__(self, domains_to_ignore=None):
        # Create the empty default per instance: a `set()` default argument is
        # a single object shared by every instance built without an argument.
        self.known_domains = set() if domains_to_ignore is None else domains_to_ignore

    def __call__(self, comment, **kwargs):
        """Return True if the first whitespace-separated word is an extracted URL."""
        urls = extract_urls_and_domains(comment["text"], self.known_domains)["urls"]

        words = comment["text"].split()
        # Empty or whitespace-only text has no first word.
        return bool(words) and words[0] in urls


class UnknownLinkAtEnd(CommentFeature):
    """Flag comments whose last word is an extracted URL.

    Args:
        domains_to_ignore: Collection of known domains, forwarded to
            ``extract_urls_and_domains`` (presumably so that URLs on those
            domains are left out; confirm against that helper). Defaults to an
            empty set.
    """

    name = "Unknown Link found at End of the Comment"

    def __init__(self, domains_to_ignore=None):
        # Create the empty default per instance: a `set()` default argument is
        # a single object shared by every instance built without an argument.
        self.known_domains = set() if domains_to_ignore is None else domains_to_ignore

    def __call__(self, comment, **kwargs):
        """Return True if the last whitespace-separated word is an extracted URL."""
        urls = extract_urls_and_domains(comment["text"], self.known_domains)["urls"]

        words = comment["text"].split()
        # Empty or whitespace-only text has no last word.
        return bool(words) and words[-1] in urls
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Instead, we could return the index for the start of the first link and the index for the end of the last link.

@jpangas wdyt?

Copy link
Collaborator Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This sounds good. Let me try it out and I will share the results.



class HourOfDay(CommentFeature):
    """Hour component of the comment's creation timestamp."""

    name = "Hour of the Day (0-23)"

    def __call__(self, comment, **kwargs):
        """Parse ``creation_time`` (``YYYY-MM-DDTHH:MM:SSZ``) and return its hour."""
        timestamp = comment["creation_time"]
        return datetime.strptime(timestamp, "%Y-%m-%dT%H:%M:%SZ").hour


class Weekday(CommentFeature):
    """ISO weekday of the comment's creation timestamp."""

    # NOTE(review): the label says 0-7, but `isoweekday()` yields 1 (Monday)
    # through 7 (Sunday). The label is kept as-is because it is the feature name.
    name = "Day of the Week (0-7)"

    def __call__(self, comment, **kwargs):
        """Parse ``creation_time`` and return its ISO weekday number."""
        timestamp = comment["creation_time"]
        return datetime.strptime(timestamp, "%Y-%m-%dT%H:%M:%SZ").isoweekday()


class PostedOnWeekend(CommentFeature):
    """Flag comments created on a Saturday or Sunday."""

    name = "Comment was Posted on Weekend"

    def __call__(self, comment, **kwargs):
        """Parse ``creation_time`` and return True if it falls on a weekend.

        The timestamp is parsed as-is with no timezone conversion, so the
        weekday is the one encoded in the ``creation_time`` string.
        """
        comment_time = datetime.strptime(comment["creation_time"], "%Y-%m-%dT%H:%M:%SZ")
        # isoweekday() numbers Monday as 1 and Sunday as 7, so the weekend is
        # 6 (Saturday) and 7 (Sunday). The previous (5, 6) check matched
        # Friday and Saturday.
        return comment_time.isoweekday() in (6, 7)


class DayOfYear(CommentFeature):
    """Day-of-year number of the comment's creation timestamp."""

    # NOTE(review): the label says 0-366, but `tm_yday` starts at 1. The label
    # is kept as-is because it is the feature name.
    name = "Day of the Year (0-366)"

    def __call__(self, comment, **kwargs):
        """Parse ``creation_time`` and return ``tm_yday`` of that date."""
        timestamp = comment["creation_time"]
        parsed = datetime.strptime(timestamp, "%Y-%m-%dT%H:%M:%SZ")
        return parsed.timetuple().tm_yday


class WeekOfYear(CommentFeature):
    """ISO week number of the comment's creation timestamp."""

    name = "Week of Year"

    def __call__(self, comment, **kwargs):
        """Parse ``creation_time`` and return the week field of ``isocalendar()``."""
        timestamp = comment["creation_time"]
        parsed = datetime.strptime(timestamp, "%Y-%m-%dT%H:%M:%SZ")
        _iso_year, iso_week, _iso_weekday = parsed.isocalendar()
        return iso_week


class CommentTags(CommentFeature):
    """List the tags attached to a comment, minus an ignore list.

    Args:
        to_ignore: Collection of tags to leave out. Defaults to an empty set.
    """

    name = "Comment Tags"

    def __init__(self, to_ignore=None):
        # Create the empty default per instance: a `set()` default argument is
        # a single object shared by every instance built without an argument.
        self.to_ignore = set() if to_ignore is None else to_ignore

    def __call__(self, comment, **kwargs):
        """Return the comment's tags, in order, skipping those in ``to_ignore``."""
        return [tag for tag in comment["tags"] if tag not in self.to_ignore]
35 changes: 35 additions & 0 deletions bugbug/model.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,7 @@
import logging
import pickle
from collections import defaultdict
from math import sqrt
from os import makedirs, path
from typing import Any

Expand Down Expand Up @@ -174,6 +175,8 @@ def __init__(self, lemmatization=False):

self.store_dataset = False

self.use_scale_pos_weight = False

self.entire_dataset_training = False

# DBs required for training.
Expand Down Expand Up @@ -216,6 +219,8 @@ def get_human_readable_feature_names(self):
feature_name = f"Comments contain '{feature_name}'"
elif type_ == "text":
feature_name = f"Combined text contains '{feature_name}'"
elif type_ == "comment_text":
feature_name = f"Comment text contains '{feature_name}'"
elif type_ == "files":
feature_name = f"File '{feature_name}'"
elif type_ not in ("data", "couple_data"):
Expand Down Expand Up @@ -388,6 +393,21 @@ def train(self, importance_cutoff=0.15, limit=None):
# Split dataset in training and test.
X_train, X_test, y_train, y_test = self.train_test_split(X, y)

# Use scale_pos_weight to help in extremely imbalanced datasets
if self.use_scale_pos_weight and is_binary:
negative_samples = sum(label == 0 for label in y_train)
positive_samples = sum(label == 1 for label in y_train)
logger.info("Negative Samples: %d", negative_samples)
logger.info("Positive Samples: %d", positive_samples)

scale_pos_weight = sqrt(negative_samples / positive_samples) / 2

logger.info("Scale Pos Weight: %d", scale_pos_weight)

self.clf.named_steps["estimator"].set_params(
scale_pos_weight=scale_pos_weight
)

tracking_metrics = {}

# Use k-fold cross validation to evaluate results.
Expand Down Expand Up @@ -803,3 +823,18 @@ def items_gen(self, classes):
continue

yield issue, classes[issue_number]


class CommentModel(Model):
    """Model whose training items are individual Bugzilla comments."""

    def __init__(self, lemmatization=False):
        Model.__init__(self, lemmatization)
        # Comments are read from the bugs they belong to.
        self.training_dbs = [bugzilla.BUGS_DB]

    def items_gen(self, classes):
        """Yield ``(comment, label)`` for every comment whose id is in ``classes``."""
        for bug in bugzilla.get_bugs():
            for comment in bug["comments"]:
                comment_id = comment["id"]
                if comment_id in classes:
                    yield comment, classes[comment_id]
1 change: 1 addition & 0 deletions bugbug/models/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -29,6 +29,7 @@
"regressionrange": "bugbug.models.regressionrange.RegressionRangeModel",
"regressor": "bugbug.models.regressor.RegressorModel",
"spambug": "bugbug.models.spambug.SpamBugModel",
"spamcomment": "bugbug.models.spamcomment.SpamCommentModel",
"stepstoreproduce": "bugbug.models.stepstoreproduce.StepsToReproduceModel",
"testlabelselect": "bugbug.models.testselect.TestLabelSelectModel",
"testgroupselect": "bugbug.models.testselect.TestGroupSelectModel",
Expand Down
Loading