diff --git a/deepctr/feature_column.py b/deepctr/feature_column.py index a80a6f84..31629f95 100644 --- a/deepctr/feature_column.py +++ b/deepctr/feature_column.py @@ -14,6 +14,23 @@ DEFAULT_GROUP_NAME = "default_group" +def _is_string_dtype(dtype): + try: + return tf.as_dtype(dtype) == tf.string + except TypeError: + return dtype == "string" + + +def _check_sparse_feature_dtype(fc): + if _is_string_dtype(fc.dtype) and not fc.use_hash: + raise ValueError( + "SparseFeat(name='{}', dtype='string') requires use_hash=True " + "so string ids can be converted before embedding lookup. " + "Alternatively, encode the feature values to integer ids before " + "passing them to DeepCTR.".format(fc.name) + ) + + class SparseFeat(namedtuple('SparseFeat', ['name', 'vocabulary_size', 'embedding_dim', 'use_hash', 'vocabulary_path', 'dtype', 'embeddings_initializer', 'embedding_name', @@ -129,12 +146,14 @@ def build_input_features(feature_columns, prefix=''): input_features = OrderedDict() for fc in feature_columns: if isinstance(fc, SparseFeat): + _check_sparse_feature_dtype(fc) input_features[fc.name] = Input( shape=(1,), name=prefix + fc.name, dtype=fc.dtype) elif isinstance(fc, DenseFeat): input_features[fc.name] = Input( shape=(fc.dimension,), name=prefix + fc.name, dtype=fc.dtype) elif isinstance(fc, VarLenSparseFeat): + _check_sparse_feature_dtype(fc) input_features[fc.name] = Input(shape=(fc.maxlen,), name=prefix + fc.name, dtype=fc.dtype) if fc.weight_name is not None: diff --git a/deepctr/inputs.py b/deepctr/inputs.py index 3c2bdbae..aeadc439 100644 --- a/deepctr/inputs.py +++ b/deepctr/inputs.py @@ -16,6 +16,27 @@ from .layers.utils import Hash +def _create_embedding_layer(feat, l2_reg, prefix, name_suffix, mask_zero=False): + emb = Embedding(feat.vocabulary_size, feat.embedding_dim, + embeddings_initializer=feat.embeddings_initializer, + embeddings_regularizer=l2(l2_reg), + name=prefix + '_' + name_suffix + '_' + feat.embedding_name, + mask_zero=mask_zero) + emb.trainable = feat.trainable + return emb + + +def _check_embedding_compatible(embedding_name, existing_feat, feat): + for attr in ('vocabulary_size', 'embedding_dim', 'trainable'): + if getattr(existing_feat, attr) != getattr(feat, attr): + raise ValueError( + "Feature columns with the same embedding_name must share the same " + "{}. embedding_name='{}' has {} and {}.".format( + attr, embedding_name, getattr(existing_feat, attr), getattr(feat, attr) + ) + ) + + def get_inputs_list(inputs): return list(chain(*list(map(lambda x: x.values(), filter(lambda x: x is not None, inputs))))) @@ -23,25 +44,30 @@ def get_inputs_list(inputs): def create_embedding_dict(sparse_feature_columns, varlen_sparse_feature_columns, seed, l2_reg, prefix='sparse_', seq_mask_zero=True): sparse_embedding = {} + embedding_feature_dict = {} + varlen_embedding_names = set( + feat.embedding_name for feat in varlen_sparse_feature_columns + ) if varlen_sparse_feature_columns else set() + for feat in sparse_feature_columns: - emb = Embedding(feat.vocabulary_size, feat.embedding_dim, - embeddings_initializer=feat.embeddings_initializer, - embeddings_regularizer=l2(l2_reg), - name=prefix + '_emb_' + feat.embedding_name) - emb.trainable = feat.trainable - sparse_embedding[feat.embedding_name] = emb + embedding_name = feat.embedding_name + if embedding_name in sparse_embedding: + _check_embedding_compatible(embedding_name, embedding_feature_dict[embedding_name], feat) + continue + mask_zero = seq_mask_zero and feat.embedding_name in varlen_embedding_names + emb = _create_embedding_layer(feat, l2_reg, prefix, 'emb', mask_zero) + sparse_embedding[embedding_name] = emb + embedding_feature_dict[embedding_name] = feat if varlen_sparse_feature_columns and len(varlen_sparse_feature_columns) > 0: for feat in varlen_sparse_feature_columns: - # if feat.name not in sparse_embedding: - emb = Embedding(feat.vocabulary_size, feat.embedding_dim, - embeddings_initializer=feat.embeddings_initializer, - embeddings_regularizer=l2( - l2_reg), - name=prefix + '_seq_emb_' + feat.name, - mask_zero=seq_mask_zero) - emb.trainable = feat.trainable + embedding_name = feat.embedding_name + if embedding_name in sparse_embedding: + _check_embedding_compatible(embedding_name, embedding_feature_dict[embedding_name], feat) + continue + emb = _create_embedding_layer(feat, l2_reg, prefix, 'seq_emb', seq_mask_zero) sparse_embedding[feat.embedding_name] = emb + embedding_feature_dict[feat.embedding_name] = feat return sparse_embedding diff --git a/tests/feature_test.py b/tests/feature_test.py index 35005fb7..31690955 100644 --- a/tests/feature_test.py +++ b/tests/feature_test.py @@ -1,6 +1,8 @@ from deepctr.models import DeepFM from deepctr.feature_column import SparseFeat, DenseFeat, VarLenSparseFeat, get_feature_names +from deepctr.inputs import create_embedding_matrix import numpy as np +import pytest def test_long_dense_vector(): @@ -28,3 +30,31 @@ def test_feature_column_sparsefeat_vocabulary_path(): vlsf = VarLenSparseFeat(sf, 6) if vlsf.vocabulary_path != vocab_path: raise ValueError("vlsf.vocabulary_path is invalid") + + +def test_create_embedding_matrix_reuses_same_embedding_name(): + feature_columns = [ + SparseFeat('item_id', 4, embedding_dim=8), + SparseFeat('item_id_copy', 4, embedding_dim=8, embedding_name='item_id'), + VarLenSparseFeat(SparseFeat('hist_item_id', 4, embedding_dim=8, embedding_name='item_id'), maxlen=3), + VarLenSparseFeat(SparseFeat('neg_hist_item_id', 4, embedding_dim=8, embedding_name='item_id'), maxlen=3), + ] + + embedding_dict = create_embedding_matrix(feature_columns, l2_reg=0, seed=1024) + + if list(embedding_dict.keys()) != ['item_id']: + raise AssertionError("Expected a single shared embedding keyed by 'item_id'") + if embedding_dict['item_id'].name != 'sparse_emb_item_id': + raise AssertionError("Expected the shared embedding layer to use the embedding_name-based layer name") + if embedding_dict['item_id'].mask_zero is not True: + raise AssertionError("Expected shared sequence embeddings to preserve mask_zero") + + +def test_create_embedding_matrix_rejects_inconsistent_shared_embedding(): + feature_columns = [ + SparseFeat('item_id', 4, embedding_dim=8), + VarLenSparseFeat(SparseFeat('hist_item_id', 5, embedding_dim=8, embedding_name='item_id'), maxlen=3), + ] + + with pytest.raises(ValueError, match="same embedding_name"): + create_embedding_matrix(feature_columns, l2_reg=0, seed=1024) diff --git a/tests/models/DIEN_test.py b/tests/models/DIEN_test.py index 0cad9bf3..ec226c15 100644 --- a/tests/models/DIEN_test.py +++ b/tests/models/DIEN_test.py @@ -9,10 +9,10 @@ def get_xy_fd(use_neg=False, hash_flag=False): - feature_columns = [SparseFeat('user', 3, hash_flag), - SparseFeat('gender', 2, hash_flag), - SparseFeat('item', 3 + 1, hash_flag), - SparseFeat('item_gender', 2 + 1, hash_flag), + feature_columns = [SparseFeat('user', 3, use_hash=hash_flag), + SparseFeat('gender', 2, use_hash=hash_flag), + SparseFeat('item', 3 + 1, embedding_dim=8, use_hash=hash_flag), + SparseFeat('item_gender', 2 + 1, embedding_dim=4, use_hash=hash_flag), DenseFeat('score', 1)] feature_columns += [ diff --git a/tests/models/MTL_test.py b/tests/models/MTL_test.py index a18a6b64..e4791863 100644 --- a/tests/models/MTL_test.py +++ b/tests/models/MTL_test.py @@ -1,6 +1,7 @@ import pytest import tensorflow as tf +from deepctr.feature_column import SparseFeat from deepctr.models.multitask import SharedBottom, ESMM, MMOE, PLE from ..utils_mtl import get_mtl_test_data, check_mtl_model @@ -27,6 +28,17 @@ def test_ESMM(): check_mtl_model(model, model_name, x, y_list, task_types=['binary', 'binary']) +def test_ESMM_string_sparse_requires_hash(): + with pytest.raises(ValueError, match="use_hash=True"): + ESMM([SparseFeat('user_id', 10, dtype='string')], tower_dnn_hidden_units=(8,)) + + +def test_ESMM_string_sparse_with_hash(): + model = ESMM([SparseFeat('user_id', 10, use_hash=True, dtype='string')], tower_dnn_hidden_units=(8,)) + if len(model.outputs) != 2: + raise AssertionError("Expected ESMM to build two task outputs") + + def test_MMOE(): if tf.__version__ == "1.15.0": # slow in tf 1.15 return