I am trying to build a multi-modal deep learning network that takes 2 inputs: an image and metadata. The metadata has shape (2206, 4) and each image has shape (256, 256, 3). Here is my code:

import tensorflow as tf

# define the two input layers
img_input = tf.keras.layers.Input(shape=(256, 256, 3), name="image")
csv_input = tf.keras.layers.Input(shape=(1503, 4), name="csv")

# define layers for image data
x1 = tf.keras.layers.experimental.preprocessing.Rescaling(1./255)(img_input)
x1 = tf.keras.layers.Conv2D(16, 3, padding='same', activation='relu', name='conv1_img')(x1)
x1 = tf.keras.layers.MaxPooling2D(name="mxp1_img")(x1)
x1 = tf.keras.layers.Conv2D(32, 3, padding='same', activation='relu', name="conv2_img")(x1)
x1 = tf.keras.layers.MaxPooling2D(name="mxp2_img")(x1)
x1 = tf.keras.layers.Conv2D(64, 3, padding='same', activation='relu', name="conv3_img")(x1)
x1 = tf.keras.layers.MaxPooling2D(name="mxp3_img")(x1)
x1 = tf.keras.layers.Flatten(name="flatten_img")(x1)

# define layers for CSV data
x2 = tf.keras.layers.Flatten(name="flatten_csv")(csv_input)
x2 = tf.keras.layers.Dense(16, activation='relu', name="dense1_csv")(x2)
x2 = tf.keras.layers.Dense(32, activation='relu', name="dense2_csv")(x2)
x2 = tf.keras.layers.Dense(64, activation='relu', name="dense3_csv")(x2)

# merge layers
x = tf.keras.layers.concatenate([x1,x2], name="concat_csv_img")
x = tf.keras.layers.Dense(128, activation='relu', name="dense1_csv_img")(x)
output = tf.keras.layers.Dense(1, name="classify")(x) #CHANGE THIS TO 2 IF IT DOESN'T WORK

#make model with 2 inputs and 1 output
model = tf.keras.models.Model(inputs=[img_input, csv_input], outputs=output) 

model.compile(optimizer='adam',
              loss=tf.keras.losses.BinaryCrossentropy(from_logits=True),
              metrics=['accuracy'])

model.summary()

The code above creates the model.
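To see exactly which shapes the model expects for each input, here is a quick probe (a minimal sketch, assuming the model above has been built; the zero arrays are just placeholders, not real data):

import numpy as np

# dummy batches matching the declared Input shapes, used only to probe the model
dummy_img = np.zeros((1, 256, 256, 3), dtype="float32")
dummy_csv = np.zeros((1, 1503, 4), dtype="float32")

print(model.input_shape)                     # [(None, 256, 256, 3), (None, 1503, 4)]
print(model([dummy_img, dummy_csv]).shape)   # (1, 1)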

import pandas as pd
from sklearn.model_selection import train_test_split

df = pd.read_csv("/Filtered Metadata.csv")
feature_columns = ['dx_type', 'age', 'sex', 'localization']
target_column = 'dx'
dx_type_mapping = {'histo': 0, 'consensus': 1, 'confocal': 2}
sex_mapping = {'male': 0, 'female': 1, 'unknown': 2}
localization_mapping = {
    'lower extremity': 0, 'face': 1, 'genital': 2,
    'chest': 3, 'back': 4, 'upper extremity': 5,
    'trunk': 6, 'neck': 7, 'scalp': 8, 'abdomen': 9,
    'hand': 10, 'foot': 11, 'unknown': 12, 'ear': 13
}
dx_mapping = {'bkl': 0, 'mel': 1}

df['dx_type'] = df['dx_type'].replace(dx_type_mapping)
df['sex'] = df['sex'].replace(sex_mapping)
df['localization'] = df['localization'].replace(localization_mapping)
df['dx'] = df['dx'].replace(dx_mapping)

X = df[feature_columns]
y = df[target_column]
batch_size = 9
X_train_csv, X_test_csv, y_train_csv, y_test_csv = train_test_split(X, y, test_size=0.3, random_state=42, shuffle = False)
for i in range(0, len(X_train_csv), batch_size):
    # note: each pass overwrites the previous slice, so after the loop only the
    # last batch is left in X_batch_train_csv / y_batch_train_csv (the test
    # slices below also reuse the training-range indices)
    X_batch_train_csv = X_train_csv[i:i+batch_size]
    y_batch_train_csv = y_train_csv[i:i+batch_size]
    X_batch_test_csv = X_test_csv[i:i+batch_size]
    y_batch_test_csv = y_test_csv[i:i+batch_size]

Above, I split the metadata dataframe into train and test sets.
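As a quick sanity check on the mappings (a small sketch, assuming df has been re-coded as above), I can confirm each column actually became numeric; if .replace() misses a category, the original string survives and the column dtype stays object:

# if any category was not covered by a mapping, .replace() keeps the original
# string and the column keeps the 'object' dtype instead of an integer dtype
for col in ['dx_type', 'sex', 'localization', 'dx']:
    print(col, df[col].dtype)

print(X_train_csv.shape, X_test_csv.shape)   # row counts after the 70/30 split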

from sklearn.model_selection import train_test_split
from tensorflow.keras.preprocessing.image import load_img, img_to_array
import numpy as np
import os
img_folder = "C:/Users/nithi/Downloads/MM Data - Copy"
def load_and_preprocess_image(image_path, target_size):
    img = load_img(image_path, target_size=target_size)
    img_array = img_to_array(img) / 255.0  # scale to [0, 1] (the model's Rescaling layer divides by 255 again)
    return img_array
image_files = [os.path.join(img_folder, filename) for filename in os.listdir(img_folder)]
images = []
for image_path in image_files:
    img_array = load_and_preprocess_image(image_path, target_size=(256, 256))
    images.append(img_array)
X = np.array(images)
X_train_img, X_test_img, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42, shuffle = False)

for i in range(0, len(X_train_img), batch_size):
    X_batch_train_img = X_train_img[i:i+batch_size]
    X_batch_test_img = X_test_img[i:i+batch_size]

Above, I split the images into train and test sets.
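Because both splits use shuffle=False, the image order from os.listdir has to line up row-for-row with the metadata dataframe. Here is a quick check of the counts (a sketch, assuming the variables above exist):

print(len(image_files), len(df))              # should match if every metadata row has an image
print(X_train_img.shape, X_train_csv.shape)   # first dimensions should agree
print(X_test_img.shape, X_test_csv.shape)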

model.fit(
    [X_batch_train_img, X_batch_train_csv],
    y_batch_train_csv,
    epochs=500,
    batch_size=32,
    validation_data=([X_batch_test_img, X_batch_test_csv], y_batch_test_csv),
)

Above, I train the model.

What I am trying to do is make sure the array sizes line up with the model's expected input shapes. I ran

X_batch_train_csv.shape

and got (9,4). I also ran

X_train_csv.shape

and got (1503, 4).
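For completeness, this is how I am comparing what the model declares against what I actually pass to fit() (a sketch, assuming everything above has been run):

print(model.input_shape)                                  # [(None, 256, 256, 3), (None, 1503, 4)]
print(X_batch_train_img.shape, X_batch_train_csv.shape)   # what gets fed as training data
print(X_batch_test_img.shape, X_batch_test_csv.shape)     # what gets fed as validation data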

I am getting this error: ValueError: Input 1 of layer "model_1" is incompatible with the layer: expected shape=(None, 1503, 4), found shape=(None, 4). However, my X_train_csv is already in the shape the model expects, so I'm not sure what's going on.