Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
19 changes: 19 additions & 0 deletions deepctr/feature_column.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,23 @@
DEFAULT_GROUP_NAME = "default_group"


def _is_string_dtype(dtype):
    """Return True when ``dtype`` denotes the TensorFlow string dtype.

    The value is converted with ``tf.as_dtype`` and compared against
    ``tf.string``. When ``tf.as_dtype`` rejects the value with ``TypeError``,
    the raw value is compared with the literal ``"string"`` instead.

    :param dtype: dtype specification taken from a feature column.
    :return: result of the comparison described above.
    """
    try:
        return tf.as_dtype(dtype) == tf.string
    except TypeError:
        # Not convertible to a TensorFlow dtype: only the plain spelling
        # "string" is still treated as a string dtype.
        return dtype == "string"


def _check_sparse_feature_dtype(fc):
if _is_string_dtype(fc.dtype) and not fc.use_hash:
raise ValueError(
"SparseFeat(name='{}', dtype='string') requires use_hash=True "
"so string ids can be converted before embedding lookup. "
"Alternatively, encode the feature values to integer ids before "
"passing them to DeepCTR.".format(fc.name)
)


class SparseFeat(namedtuple('SparseFeat',
['name', 'vocabulary_size', 'embedding_dim', 'use_hash', 'vocabulary_path', 'dtype', 'embeddings_initializer',
'embedding_name',
Expand Down Expand Up @@ -129,12 +146,14 @@ def build_input_features(feature_columns, prefix=''):
input_features = OrderedDict()
for fc in feature_columns:
if isinstance(fc, SparseFeat):
_check_sparse_feature_dtype(fc)
input_features[fc.name] = Input(
shape=(1,), name=prefix + fc.name, dtype=fc.dtype)
elif isinstance(fc, DenseFeat):
input_features[fc.name] = Input(
shape=(fc.dimension,), name=prefix + fc.name, dtype=fc.dtype)
elif isinstance(fc, VarLenSparseFeat):
_check_sparse_feature_dtype(fc)
input_features[fc.name] = Input(shape=(fc.maxlen,), name=prefix + fc.name,
dtype=fc.dtype)
if fc.weight_name is not None:
Expand Down
54 changes: 40 additions & 14 deletions deepctr/inputs.py
Original file line number Diff line number Diff line change
Expand Up @@ -16,32 +16,58 @@
from .layers.utils import Hash


def _create_embedding_layer(feat, l2_reg, prefix, name_suffix, mask_zero=False):
    """Build the ``Embedding`` layer for a single feature column.

    :param feat: feature column providing ``vocabulary_size``,
        ``embedding_dim``, ``embeddings_initializer``, ``embedding_name``
        and ``trainable``.
    :param l2_reg: factor passed to ``l2`` for the embeddings regularizer.
    :param prefix: first component of the layer name.
    :param name_suffix: middle component of the layer name.
    :param mask_zero: forwarded to ``Embedding`` as ``mask_zero``.
    :return: the configured ``Embedding`` layer.
    """
    # Layer name is "<prefix>_<name_suffix>_<embedding_name>".
    layer_name = '_'.join((prefix, name_suffix, feat.embedding_name))
    emb = Embedding(feat.vocabulary_size, feat.embedding_dim,
                    embeddings_initializer=feat.embeddings_initializer,
                    embeddings_regularizer=l2(l2_reg),
                    name=layer_name,
                    mask_zero=mask_zero)
    # Trainability is applied on the constructed layer, not via the constructor.
    emb.trainable = feat.trainable
    return emb


def _check_embedding_compatible(embedding_name, existing_feat, feat):
for attr in ('vocabulary_size', 'embedding_dim', 'trainable'):
if getattr(existing_feat, attr) != getattr(feat, attr):
raise ValueError(
"Feature columns with the same embedding_name must share the same "
"{}. embedding_name='{}' has {} and {}.".format(
attr, embedding_name, getattr(existing_feat, attr), getattr(feat, attr)
)
)


def get_inputs_list(inputs):
    """Flatten the values of several mappings into one list.

    :param inputs: iterable of mappings (anything with ``values()``);
        ``None`` entries are skipped.
    :return: list with the values of every non-``None`` mapping, in order.
    """
    return list(chain.from_iterable(
        mapping.values() for mapping in inputs if mapping is not None
    ))


def create_embedding_dict(sparse_feature_columns, varlen_sparse_feature_columns, seed, l2_reg,
                          prefix='sparse_', seq_mask_zero=True):
    """Create one embedding layer per distinct ``embedding_name``.

    Feature columns that share an ``embedding_name`` reuse the layer created
    for the first of them, after ``_check_embedding_compatible`` has confirmed
    that they agree on the attributes it compares.

    :param sparse_feature_columns: iterable of sparse feature columns.
    :param varlen_sparse_feature_columns: iterable of variable-length sparse
        feature columns; may be ``None`` or empty.
    :param seed: accepted for interface compatibility; not read here.
    :param l2_reg: L2 regularisation factor forwarded to the layer factory.
    :param prefix: prefix for the generated layer names.
    :param seq_mask_zero: whether embeddings used by variable-length columns
        are created with ``mask_zero`` enabled.
    :return: dict mapping ``embedding_name`` to its embedding layer.
    :raises ValueError: propagated from ``_check_embedding_compatible`` when
        columns sharing an ``embedding_name`` are inconsistent.
    """
    sparse_embedding = {}
    # First feature column registered for each embedding name; later columns
    # with the same name are validated against it.
    embedding_feature_dict = {}
    varlen_columns = varlen_sparse_feature_columns or ()
    varlen_embedding_names = {feat.embedding_name for feat in varlen_columns}

    for feat in sparse_feature_columns:
        embedding_name = feat.embedding_name
        if embedding_name in sparse_embedding:
            _check_embedding_compatible(embedding_name, embedding_feature_dict[embedding_name], feat)
            continue
        # An embedding that a variable-length column will reuse has to carry
        # the sequence mask setting from the moment it is created.
        mask_zero = seq_mask_zero and embedding_name in varlen_embedding_names
        sparse_embedding[embedding_name] = _create_embedding_layer(feat, l2_reg, prefix, 'emb', mask_zero)
        embedding_feature_dict[embedding_name] = feat

    for feat in varlen_columns:
        embedding_name = feat.embedding_name
        if embedding_name in sparse_embedding:
            _check_embedding_compatible(embedding_name, embedding_feature_dict[embedding_name], feat)
            continue
        sparse_embedding[embedding_name] = _create_embedding_layer(
            feat, l2_reg, prefix, 'seq_emb', seq_mask_zero)
        embedding_feature_dict[embedding_name] = feat
    return sparse_embedding


Expand Down
30 changes: 30 additions & 0 deletions tests/feature_test.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,8 @@
from deepctr.models import DeepFM
from deepctr.feature_column import SparseFeat, DenseFeat, VarLenSparseFeat, get_feature_names
from deepctr.inputs import create_embedding_matrix
import numpy as np
import pytest


def test_long_dense_vector():
Expand Down Expand Up @@ -28,3 +30,31 @@ def test_feature_column_sparsefeat_vocabulary_path():
vlsf = VarLenSparseFeat(sf, 6)
if vlsf.vocabulary_path != vocab_path:
raise ValueError("vlsf.vocabulary_path is invalid")


def test_create_embedding_matrix_reuses_same_embedding_name():
    """Columns sharing ``embedding_name='item_id'`` must yield one masked embedding."""
    feature_columns = [
        SparseFeat('item_id', 4, embedding_dim=8),
        SparseFeat('item_id_copy', 4, embedding_dim=8, embedding_name='item_id'),
        VarLenSparseFeat(SparseFeat('hist_item_id', 4, embedding_dim=8, embedding_name='item_id'), maxlen=3),
        VarLenSparseFeat(SparseFeat('neg_hist_item_id', 4, embedding_dim=8, embedding_name='item_id'), maxlen=3),
    ]

    embedding_dict = create_embedding_matrix(feature_columns, l2_reg=0, seed=1024)

    # Explicit raises instead of bare asserts, matching the other checks in this file.
    if list(embedding_dict.keys()) != ['item_id']:
        raise AssertionError("Expected a single shared embedding keyed by 'item_id'")
    if embedding_dict['item_id'].name != 'sparse_emb_item_id':
        raise AssertionError("Expected the shared embedding layer to use the embedding_name-based layer name")
    if embedding_dict['item_id'].mask_zero is not True:
        raise AssertionError("Expected shared sequence embeddings to preserve mask_zero")


def test_create_embedding_matrix_rejects_inconsistent_shared_embedding():
    """Sharing an embedding across different vocabulary sizes must raise ValueError."""
    feature_columns = [
        SparseFeat('item_id', 4, embedding_dim=8),
        # Same embedding_name but vocabulary size 5 instead of 4.
        VarLenSparseFeat(SparseFeat('hist_item_id', 5, embedding_dim=8, embedding_name='item_id'), maxlen=3),
    ]

    with pytest.raises(ValueError, match="same embedding_name"):
        create_embedding_matrix(feature_columns, l2_reg=0, seed=1024)
8 changes: 4 additions & 4 deletions tests/models/DIEN_test.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,10 +9,10 @@


def get_xy_fd(use_neg=False, hash_flag=False):
feature_columns = [SparseFeat('user', 3, hash_flag),
SparseFeat('gender', 2, hash_flag),
SparseFeat('item', 3 + 1, hash_flag),
SparseFeat('item_gender', 2 + 1, hash_flag),
feature_columns = [SparseFeat('user', 3, use_hash=hash_flag),
SparseFeat('gender', 2, use_hash=hash_flag),
SparseFeat('item', 3 + 1, embedding_dim=8, use_hash=hash_flag),
SparseFeat('item_gender', 2 + 1, embedding_dim=4, use_hash=hash_flag),
DenseFeat('score', 1)]

feature_columns += [
Expand Down
12 changes: 12 additions & 0 deletions tests/models/MTL_test.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
import pytest
import tensorflow as tf

from deepctr.feature_column import SparseFeat
from deepctr.models.multitask import SharedBottom, ESMM, MMOE, PLE
from ..utils_mtl import get_mtl_test_data, check_mtl_model

Expand All @@ -27,6 +28,17 @@ def test_ESMM():
check_mtl_model(model, model_name, x, y_list, task_types=['binary', 'binary'])


def test_ESMM_string_sparse_requires_hash():
    """Building ESMM with a string SparseFeat and no use_hash must raise ValueError."""
    with pytest.raises(ValueError, match="use_hash=True"):
        ESMM([SparseFeat('user_id', 10, dtype='string')], tower_dnn_hidden_units=(8,))


def test_ESMM_string_sparse_with_hash():
    """ESMM must build with a string SparseFeat when use_hash=True is set."""
    model = ESMM([SparseFeat('user_id', 10, use_hash=True, dtype='string')], tower_dnn_hidden_units=(8,))
    if len(model.outputs) != 2:
        raise AssertionError("Expected ESMM to build two task outputs")


def test_MMOE():
if tf.__version__ == "1.15.0": # slow in tf 1.15
return
Expand Down
Loading