BERT embedding layer raises `TypeError: unsupported operand type(s) for +: 'NoneType' and 'int'` with BiLSTM

I'm having problems integrating the BERT embedding layer into a BiLSTM model for a word sense disambiguation task. My environment:

Windows 10
Python 3.6.4
TensorFlow 1.12
Keras 2.2.4
No virtual environments were used
PyCharm Professional 2019.2

The whole script:

import os
import yaml
import numpy as np
from argparse import ArgumentParser

import tensorflow as tf
import tensorflow_hub as hub
from tensorflow.keras.layers import (LSTM, Add, Bidirectional, Dense, Input, TimeDistributed, Embedding)

from tensorflow.keras.preprocessing.sequence import pad_sequences

try:
    from bert.tokenization import FullTokenizer
except ModuleNotFoundError:
    # Install on the fly if missing, then retry the import.
    os.system('pip install bert-tensorflow')
    from bert.tokenization import FullTokenizer

from tensorflow.keras.models import Model
from tensorflow.keras import backend as K
from tqdm import tqdm

from keras_bert import BertEmbeddingLayer
from model_utils import visualize_plot_mdl
from parsing_dataset import load_dataset
from utilities import configure_tf, initialize_logger


def parse_args():
    parser = ArgumentParser(description="WSD")
    parser.add_argument("--model_type", default='baseline', type=str,
                        help="""Choose the model: baseline: BiLSTM Model.
                                attention: Attention Stacked BiLSTM Model.
                                seq2seq: Seq2Seq Attention.""")

    return vars(parser.parse_args())


def train_model(mdl, data, epochs=1, batch_size=32):
    [train_input_ids, train_input_masks, train_segment_ids], train_labels = data
    history = mdl.fit([train_input_ids, train_input_masks, train_segment_ids],
                      train_labels, epochs=epochs, batch_size=batch_size)
    return history


def baseline_model(output_size):
    hidden_size = 128
    max_seq_len = 64

    in_id = Input(shape=(None,), name="input_ids")
    in_mask = Input(shape=(None,), name="input_masks")
    in_segment = Input(shape=(None,), name="segment_ids")
    bert_inputs = [in_id, in_mask, in_segment]

    bert_embedding = BertEmbeddingLayer()(bert_inputs)
    embedding_size = 768

    bilstm = Bidirectional(LSTM(hidden_size, dropout=0.2,
                                recurrent_dropout=0.2,
                                return_sequences=True
                                )
                           )(bert_embedding)

    output = TimeDistributed(Dense(output_size, activation="softmax"))(bilstm)

    mdl = Model(inputs=bert_inputs, outputs=output, name="Bert_BiLSTM")

    mdl.compile(loss="sparse_categorical_crossentropy",
                optimizer='adadelta', metrics=["acc"])

    return mdl


def initialize_vars(sess):
    sess.run(tf.local_variables_initializer())
    sess.run(tf.global_variables_initializer())
    sess.run(tf.tables_initializer())
    K.set_session(sess)


class PaddingInputExample(object):
    """Fake example so the num input examples is a multiple of the batch size.
  When running eval/predict on the TPU, we need to pad the number of examples
  to be a multiple of the batch size, because the TPU requires a fixed batch
  size. The alternative is to drop the last batch, which is bad because it means
  the entire output data won't be generated.
  We use this class instead of `None` because treating `None` as padding
  batches could cause silent errors.
  """

class InputExample(object):
    """A single training/test example for simple sequence classification."""

    def __init__(self, guid, text_a, text_b=None, label=None):
        """Constructs a InputExample.
    Args:
      guid: Unique id for the example.
      text_a: string. The un-tokenized text of the first sequence. For single
        sequence tasks, only this sequence must be specified.
      text_b: (Optional) string. The un-tokenized text of the second sequence.
        Only must be specified for sequence pair tasks.
      label: (Optional) string. The label of the example. This should be
        specified for train and dev examples, but not for test examples.
    """
        self.guid = guid
        self.text_a = text_a
        self.text_b = text_b
        self.label = label


def create_tokenizer_from_hub_module(bert_path="https://tfhub.dev/google/bert_uncased_L-12_H-768_A-12/1"):
    """Get the vocab file and casing info from the Hub module."""
    bert_module = hub.Module(bert_path)
    tokenization_info = bert_module(signature="tokenization_info", as_dict=True)
    vocab_file, do_lower_case = sess.run(
        [
            tokenization_info["vocab_file"],
            tokenization_info["do_lower_case"],
        ]
    )

    return FullTokenizer(vocab_file=vocab_file, do_lower_case=do_lower_case)


def convert_single_example(tokenizer, example, max_seq_length=256):
    """Converts a single `InputExample` into a single `InputFeatures`."""

    if isinstance(example, PaddingInputExample):
        input_ids = [0] * max_seq_length
        input_mask = [0] * max_seq_length
        segment_ids = [0] * max_seq_length
        label = [0] * max_seq_length
        return input_ids, input_mask, segment_ids, label

    tokens_a = tokenizer.tokenize(example.text_a)
    if len(tokens_a) > max_seq_length - 2:
        tokens_a = tokens_a[0: (max_seq_length - 2)]

    tokens = []
    segment_ids = []
    tokens.append("[CLS]")
    segment_ids.append(0)
    example.label.append(0)
    for token in tokens_a:
        tokens.append(token)
        segment_ids.append(0)
    tokens.append("[SEP]")
    segment_ids.append(0)
    example.label.append(0)

    input_ids = tokenizer.convert_tokens_to_ids(tokens)

    # The mask has 1 for real tokens and 0 for padding tokens. Only real
    # tokens are attended to.
    input_mask = [1] * len(input_ids)

    # Zero-pad up to the sequence length.
    while len(input_ids) < max_seq_length:
        input_ids.append(0)
        input_mask.append(0)
        segment_ids.append(0)
        example.label.append(0)

    assert len(input_ids) == max_seq_length
    assert len(input_mask) == max_seq_length
    assert len(segment_ids) == max_seq_length

    return input_ids, input_mask, segment_ids, example.label


def convert_examples_to_features(tokenizer, examples, max_seq_length=256):
    """Convert a set of `InputExample`s to a list of `InputFeatures`."""

    input_ids, input_masks, segment_ids, labels = [], [], [], []
    for example in tqdm(examples, desc="Converting examples to features"):
        input_id, input_mask, segment_id, label = convert_single_example(tokenizer, example, max_seq_length)
        input_ids.append(np.array(input_id))
        input_masks.append(np.array(input_mask))
        segment_ids.append(np.array(segment_id))
        labels.append(np.array(label))
    return np.array(input_ids), np.array(input_masks), np.array(segment_ids), np.array(labels).reshape(-1, 1)


def convert_text_to_examples(texts, labels):
    """Create InputExamples"""
    InputExamples = []
    for text, label in zip(texts, labels):
        InputExamples.append(
            InputExample(guid=None, text_a=" ".join(text), text_b=None, label=label)
        )
    return InputExamples


# Initialize session
sess = tf.Session()

params = parse_args()
initialize_logger()
configure_tf()

# Load our config file
config_file_path = os.path.join(os.getcwd(), "config.yaml")
config_file = open(config_file_path)
config_params = yaml.load(config_file)

# This parameter keeps train_x in the form of words, so the keras-elmo layer can be used
elmo = config_params["use_elmo"]  
dataset = load_dataset(elmo=elmo)
vocabulary_size = dataset.get("vocabulary_size")
output_size = dataset.get("output_size")

# Parse data in Bert format
max_seq_length = 64
train_x = dataset.get("train_x")
train_text = [' '.join(x) for x in train_x]
train_text = [' '.join(t.split()[0:max_seq_length]) for t in train_text]
train_text = np.array(train_text, dtype=object)[:, np.newaxis]
# print(train_text.shape)  # (37184, 1)
train_labels = dataset.get("train_y")

# Instantiate tokenizer
tokenizer = create_tokenizer_from_hub_module()

# Convert data to InputExample format
train_examples = convert_text_to_examples(train_text, train_labels)

# Extract features
(train_input_ids, train_input_masks, train_segment_ids, train_labels) = convert_examples_to_features(tokenizer, train_examples, max_seq_length=max_seq_length)

bert_inputs = [train_input_ids, train_input_masks, train_segment_ids]
data = bert_inputs, train_labels
del dataset

model = baseline_model(output_size)

# Instantiate variables
initialize_vars(sess)

history = train_model(model, data)

The layer BertEmbeddingLayer() is imported from strongio/keras-bert, and I followed the approach in that file to integrate it into my work. However, I always get this error; please check the traceback below (the exception is raised when building the model):

Traceback (most recent call last):
  File "code/prova_bert.py", line 230, in <module>
    model = baseline_model(output_size, max_seq_len, visualize=True)
  File "code/prova_bert.py", line 165, in baseline_model
    )(bert_embeddings)
  File "C:\Users\Sheikh\AppData\Local\Programs\Python\Python36\Lib\site-packages\tensorflow\python\keras\layers\wrappers.py", line 473, in __call__
    return super(Bidirectional, self).__call__(inputs, **kwargs)
  File "C:\Users\Sheikh\AppData\Local\Programs\Python\Python36\Lib\site-packages\tensorflow\python\keras\engine\base_layer.py", line 746, in __call__
    self.build(input_shapes)
  File "C:\Users\Sheikh\AppData\Local\Programs\Python\Python36\Lib\site-packages\tensorflow\python\keras\layers\wrappers.py", line 612, in build
    self.forward_layer.build(input_shape)
  File "C:\Users\Sheikh\AppData\Local\Programs\Python\Python36\Lib\site-packages\tensorflow\python\keras\utils\tf_utils.py", line 149, in wrapper
    output_shape = fn(instance, input_shape)
  File "C:\Users\Sheikh\AppData\Local\Programs\Python\Python36\Lib\site-packages\tensorflow\python\keras\layers\recurrent.py", line 552, in build
    self.cell.build(step_input_shape)
  File "C:\Users\Sheikh\AppData\Local\Programs\Python\Python36\Lib\site-packages\tensorflow\python\keras\utils\tf_utils.py", line 149, in wrapper
    output_shape = fn(instance, input_shape)
  File "C:\Users\Sheikh\AppData\Local\Programs\Python\Python36\Lib\site-packages\tensorflow\python\keras\layers\recurrent.py", line 1934, in build
    constraint=self.kernel_constraint)
  File "C:\Users\Sheikh\AppData\Local\Programs\Python\Python36\Lib\site-packages\tensorflow\python\keras\engine\base_layer.py", line 609, in add_weight
    aggregation=aggregation)
  File "C:\Users\Sheikh\AppData\Local\Programs\Python\Python36\Lib\site-packages\tensorflow\python\training\checkpointable\base.py", line 639, in _add_variable_with_custom_getter
    **kwargs_for_getter)
  File "C:\Users\Sheikh\AppData\Local\Programs\Python\Python36\Lib\site-packages\tensorflow\python\keras\engine\base_layer.py", line 1977, in make_variable
    aggregation=aggregation)
  File "C:\Users\Sheikh\AppData\Local\Programs\Python\Python36\Lib\site-packages\tensorflow\python\ops\variables.py", line 183, in __call__
    return cls._variable_v1_call(*args, **kwargs)
  File "C:\Users\Sheikh\AppData\Local\Programs\Python\Python36\Lib\site-packages\tensorflow\python\ops\variables.py", line 146, in _variable_v1_call
    aggregation=aggregation)
  File "C:\Users\Sheikh\AppData\Local\Programs\Python\Python36\Lib\site-packages\tensorflow\python\ops\variables.py", line 125, in <lambda>
    previous_getter = lambda **kwargs: default_variable_creator(None, **kwargs)
  File "C:\Users\Sheikh\AppData\Local\Programs\Python\Python36\Lib\site-packages\tensorflow\python\ops\variable_scope.py", line 2437, in default_variable_creator
    import_scope=import_scope)
  File "C:\Users\Sheikh\AppData\Local\Programs\Python\Python36\Lib\site-packages\tensorflow\python\ops\variables.py", line 187, in __call__
    return super(VariableMetaclass, cls).__call__(*args, **kwargs)
  File "C:\Users\Sheikh\AppData\Local\Programs\Python\Python36\Lib\site-packages\tensorflow\python\ops\resource_variable_ops.py", line 297, in __init__
    constraint=constraint)
  File "C:\Users\Sheikh\AppData\Local\Programs\Python\Python36\Lib\site-packages\tensorflow\python\ops\resource_variable_ops.py", line 409, in _init_from_args
    initial_value() if init_from_fn else initial_value,
  File "C:\Users\Sheikh\AppData\Local\Programs\Python\Python36\Lib\site-packages\tensorflow\python\keras\engine\base_layer.py", line 1959, in <lambda>
    shape, dtype=dtype, partition_info=partition_info)
  File "C:\Users\Sheikh\AppData\Local\Programs\Python\Python36\Lib\site-packages\tensorflow\python\ops\init_ops.py", line 473, in __call__
    scale /= max(1., (fan_in + fan_out) / 2.)
TypeError: unsupported operand type(s) for +: 'NoneType' and 'int'
Exception ignored in: <bound method BaseSession.__del__ of <tensorflow.python.client.session.Session object at 0x0000026396AD0630>>
Traceback (most recent call last):
  File "C:\Users\Sheikh\AppData\Local\Programs\Python\Python36\Lib\site-packages\tensorflow\python\client\session.py", line 738, in __del__
TypeError: 'NoneType' object is not callable

Please refer to my issue on their repo; for examples of the data being fed to the model, please check this issue.

Allen answered 29/10, 2019 at 12:48 Comment(24)
If anything is not clear, please let me know. I am open to using other libraries, provided I am shown how to use them. Thank you and have a great day.Allen
I don't think the code you're showing is the same version that's giving the exception. The first line in the traceback doesn't exist in your baseline_model function. It looks like at some point you renamed bert_embeddings to bert_output but it's impossible to be sure. Please make sure you're posting the actual code and the actual traceback associated with that code when asking for debugging help. Also, have you considered it could be a bug in keras or tensorflow? Try searching on that.Ingather
Please find the whole code in this gist.Allen
It would be better if you edited your question to include your exact code and the exact traceback you get by running it, as well as how you are running it. All of this can fit easily in your question here, without linking to an external site.Ingather
@Ingather the question is updatedAllen
I don't know the internals of tensorflow very well, but it would appear that somewhere in your model you are expecting it to be able to guess the shape of your inputs, when in fact you will need to correctly specify that shape manually; possibly in LSTM (you are passing input_shape=(None, None, embedding_size) but it may not be agnostic to anything except maybe the batch size).Ingather
I already tried that, and it did not work sadly, I still cannot figure out what is wrong..Allen
I tried running your code but I don't have enough context to make it work. Have you ever used pdb? I find that when I'm really baffled by something the quickest and easiest way to figure out what's going on is to step through the code with pdb. Even if you don't understand every part of the code you can gain insight. Here the error you're getting is with the variable fan_in in this operand initialization routine for a model. It's getting set to None when it should be an int. So you can try to trace where fan_in came from and why it's None.Ingather
In this case it's slightly complicated of course, but most of the frames in your traceback seem to be through various layers of abstraction that tensorflow uses to support inputs of different shapes.Ingather
I really appreciate your help, I will try to use pdb as you advised.Allen
Do you just need bertEmbeddings which you want to feed into LSTM? or do you want to train a BERT along with LSTM layer on top of it?Humeral
If I check the current implementation of github.com/strongio/keras-bert/blob/…; It looks very much different from what you have used. Can you please try to use their new package?Humeral
Also change in_id = Input(shape=(None,), name="input_ids") to in_id = Input(shape=(max_sequence_len,), name="input_ids"), and the same with in_mask and in_segment. Since you are setting them to None, it is assumed that they are empty and BERT outputs None, thus causing an error with the LSTM layer.Humeral
@AshwinGeetD'Sa I just want to fine-tune BERT embeddings on my dataset. I changed to the new package as you asked and replaced the Nones as well; however, no change. From what I know, when we set shape=(None,) it is basically the same as assigning a value; it just tells TF that it should expect a number there, not None.Allen
I actually tried to execute your code, but due to the absence of the dataset and other packages, I couldn't debug much.Humeral
Did you try fitting a simple NN instead of the LSTM, as shown in the example? Because I have used their code before and it has worked.Humeral
Solved it: first of all, the result of "mean" or "first" pooling is not per token, so you have to change the call() function as in the next comment:Humeral
` elif self.pooling == "mean": result = self.bert(inputs=bert_inputs, signature="tokens", as_dict=True)[ "sequence_output" ] #mul_mask = ... #masked_reduce_mean = lambda x, m: tf.reduce_sum(mul_mask(x, m), axis=1) / ( # tf.reduce_sum(m, axis=1, keepdims=True) + 1e-10) #input_mask = tf.cast(input_mask, tf.float32) #pooled = masked_reduce_mean(result, input_mask) pooled = result else: ... `Humeral
` in_id = tf.keras.layers.Input(shape=(max_seq_length,), name="input_ids") in_mask = tf.keras.layers.Input(shape=(max_seq_length,), name="input_masks") in_segment = tf.keras.layers.Input(shape=(max_seq_length,), name="segment_ids") bert_inputs = [in_id, in_mask, in_segment] bert_output = BertLayer(n_fine_tune_layers=12, pooling="mean")(bert_inputs) bert_output = tf.keras.layers.Reshape((max_seq_length,768))(bert_output) bilstm = tf.keras.layers.Bidirectional(tf.keras.layers.LSTM(128, dropout=0.2,recurrent_dropout=0.2,return_sequences=True))(bert_output) `Humeral
The above comment must be used in build_model().Humeral
@AshwinGeetD'Sa is this what you meant? Correct me if I am wrong; it worked.Allen
@AshwinGeetD'Sa However, in model.fit(...) I get another error: ValueError: Error when checking target: expected time_distributed to have 3 dimensions, but got array with shape (37176, 1) Allen
Yes, that is what I meant. Give me some time, will check with time_distributed layerHumeral
Let us continue this discussion in chat.Humeral

First of all, the result of "mean" or "first" pooling is not per token (it is not returned for all the tokens), so you have to change the call() function:

elif self.pooling == "mean": 
    result = self.bert(inputs=bert_inputs, signature="tokens", as_dict=True)["sequence_output" ] 
    pooled = result
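
With sequence_output the hub module returns one 768-dimensional vector per token, i.e. a tensor of shape (batch_size, max_seq_length, 768), which is what a token-level BiLSTM needs; the "first"/pooled output would instead collapse each example to a single vector. The Reshape in the snippet below is presumably only there to restore the static (max_seq_length, 768) shape that the custom layer does not declare, which is exactly the missing dimension the LSTM's weight initializer was choking on.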

In build_model, change to:

embedding_size = 768
in_id = Input(shape=(max_seq_length,), name="input_ids") 
in_mask = Input(shape=(max_seq_length,), name="input_masks")
in_segment = Input(shape=(max_seq_length,), name="segment_ids")

bert_inputs = [in_id, in_mask, in_segment] 
bert_output = BertLayer(n_fine_tune_layers=12, pooling="mean")(bert_inputs) 
bert_output = Reshape((max_seq_length, embedding_size))(bert_output) 

bilstm = Bidirectional(LSTM(128, dropout=0.2,recurrent_dropout=0.2,return_sequences=True))(bert_output)
output = Dense(output_size, activation="softmax")(bilstm)
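
Note that the follow-up errors in the comments below (a target array of shape (N, 1) for an output of shape (None, max_seq_length, output_size)) come from having one label per example instead of one label per token. A minimal sketch of the target shape this per-token softmax expects, with hypothetical sizes:

import numpy as np

num_examples = 4        # hypothetical; use your own dataset size
max_seq_length = 64
num_senses = 10         # hypothetical number of sense classes (output_size)

# One integer sense id per token position (zeros here as a placeholder;
# 0 can double as the padding label).
train_labels = np.zeros((num_examples, max_seq_length), dtype=np.int32)

# sparse_categorical_crossentropy also accepts a trailing singleton axis,
# i.e. shape (num_examples, max_seq_length, 1).
train_labels = np.expand_dims(train_labels, axis=-1)

Depending on how the labels are built, the reshape(-1, 1) at the end of convert_examples_to_features may be where the per-token structure gets lost.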

Nereid answered 8/11, 2019 at 15:34 Comment(3)
I am trying to solve the same problem and am using your code, but I get the error below. Any ideas how to fix this? ValueError: A target array with shape (9300, 1) was passed for an output of shape (None, 256, 1) while using as loss binary_crossentropy. This loss expects targets to have the same shape as the output.Ungodly
@Ungodly did you solve it? I got ValueError: Shapes must be equal rank, but are 2 and 3 From merging shape 0 with other shapes. for '{{node reshape/Shape/packed}} = Pack[N=2, T=DT_FLOAT, axis=0](Placeholder, Placeholder_1)' with input shapes: [?,768], [?,?,768].Collotype
@user: can you verify if you have a sequential output or outputs corresponding to the CLS token only?Humeral
