Cannot create queue pair with ib_create_qp
Asked Answered
T

2

7

I am writing an RDMA (InfiniBand) kernel module.

Up to now I have been successful creating the protection domain, completion queues for send and receive queues.

But whenever I try to create the queue pair by invoking ib_create_qp, the call fails. The code I wrote is shown below:

#include <linux/kernel.h>
#include <linux/init.h>
#include <linux/module.h>
#include <linux/list.h>
#include <linux/module.h>
#include <linux/err.h>
#include "myClient.h"


/* Not referenced anywhere else in the visible code. */
struct workqueue_struct *myClient_workqueue;
/* Not referenced anywhere else in the visible code. */
struct ib_sa_client myClient_sa_client;
/*
 * Forward declarations, currently disabled; both functions are defined
 * further down, before the ib_client table that refers to them.
 *
static void myClient_add_one(struct ib_device *device);
static void myClient_remove_one(struct ib_device *device);
*/

/*
 * Verbs objects created by myClient_add_one(). They are file-scope
 * singletons, so each call to myClient_add_one() overwrites them.
 */
struct ib_pd *mypd;             /* protection domain from ib_alloc_pd() */
struct ib_cq *myrcvcq;          /* receive completion queue */
struct ib_cq *myClientsendcq;   /* send completion queue */
struct ib_qp *myClientqp;       /* queue pair from ib_create_qp() */

/*
 * Completion callback handed to ib_create_cq() for the receive CQ.
 * It only logs a fixed message; @cq is not polled or re-armed here.
 *
 * NOTE(review): the verbs completion-handler type presumably also takes a
 * "void *cq_context" argument - confirm against ib_verbs.h for the headers
 * this module is built with.
 */
void myClient_ib_recvcompletion(struct ib_cq *cq)
{
    printk("A user-specified callback that is invoked when a completion event occurs on the CQ.\n");
}


/*
 * Completion callback handed to ib_create_cq() for the send CQ.
 * It only logs a fixed message (the same text as the receive callback);
 * @cq is not polled or re-armed here.
 *
 * NOTE(review): the verbs completion-handler type presumably also takes a
 * "void *cq_context" argument - confirm against ib_verbs.h for the headers
 * this module is built with.
 */
void myClient_ib_sendcompletion(struct ib_cq *cq)
{
        printk("A user-specified callback that is invoked when a completion event occurs on the CQ.\n");
}
/*
 * Asynchronous-event callback with the (struct ib_event *, void *) shape
 * used for queue-pair event handlers. Both arguments are ignored; the
 * function only logs a fixed message.
 */
static void my_qp_event_handler(struct ib_event *myqpAsyncEvent, void *anyPointer)
{
        printk(KERN_INFO "Dummy affiliated asynchronous event occured function called \n");
}


static void myClient_add_one(struct ib_device *device)
{
    union ib_gid tmp_gid;
    int ret;
    int hcaport = 1;
    int result = -ENOMEM;
    u16 port1Pkey;
    struct ib_port_attr attr;

        ret = ib_query_port(device,hcaport,&attr);
        printk("ib query port result %d  \n", ret);

//  Creating the Protection Domain for RDMA
    mypd = ib_alloc_pd(device);

    if(IS_ERR(mypd)){
        printk(KERN_INFO "Failed to allocate PD\n");
        return;
    }
    else{
        printk(KERN_INFO "1Successfully allocated the PD\n");
        pdset = true;
    }

//  Creating the receive completion queue for RDMA
    myrcvcq = ib_create_cq(device,myClient_ib_recvcompletion,NULL,NULL,myClient_recvq_size,0);
        if(IS_ERR(myrcvcq)){
                pr_err("%s:%d error code for receive cq%d\n", __func__, __LINE__, PTR_ERR(myrcvcq));
                //printk("Error creating QP: %d \n",PTR_ERR(myClientqp));
        }
    else{
        printk("Recieve CQ successfully created in address: %x \n",myrcvcq);
    }

//  Creating the send completion queue for RDMA
    myClientsendcq = ib_create_cq(device,myClient_ib_sendcompletion, NULL, NULL,myClient_sendq_size,0 );
        if(IS_ERR(myClientsendcq)){
                pr_err("%s:%d scqerror code for send cq%d\n", __func__, __LINE__, PTR_ERR(myClientsendcq));
                //printk("Error creating QP: %d \n",PTR_ERR(myClientqp));
        }
        else{
                printk("1Send CQ successfully created in address: %x \n",myClientsendcq);
        }

//  Creating the queue pair
//      Creating the queue pair

        struct ib_qp_init_attr init_qpattr;

        memset(&init_qpattr,0,sizeof(init_qpattr));
        init_qpattr.event_handler = myClient_qp_event_handler;
        init_qpattr.cap.max_send_wr = 2;
        init_qpattr.cap.max_recv_wr = 2;
        init_qpattr.cap.max_recv_sge = 1;
        init_qpattr.cap.max_send_sge = 1;
        init_qpattr.sq_sig_type = IB_SIGNAL_ALL_WR;
        init_qpattr.qp_type = IB_QPT_UD;
        init_qpattr.send_cq = myClientsendcq;
        init_qpattr.recv_cq = myrcvcq;

        myClientqp = ib_create_qp(mypd,&init_qpattr);

        if(IS_ERR(myClientqp)){
                pr_err("%s:%d error code %d\n", __func__, __LINE__, PTR_ERR(myClientqp));
                //printk("Error creating QP: %d \n",PTR_ERR(myClientqp));
        }
        else{
                printk(KERN_INFO "1The queue pair is successfully created \n");
                qpcreated = true;
        }



}
/*
 * ib_client .remove callback for @device. Currently a no-op.
 *
 * NOTE(review): none of the objects created in myClient_add_one() (PD,
 * CQs, QP) are released here - this looks like a leak when the device goes
 * away; confirm and add teardown.
 */
static void myClient_remove_one(struct ib_device *device)
{
}

/*
 * Client descriptor passed to ib_register_client() from the module init
 * function; it wires the add/remove callbacks defined above.
 */
static struct ib_client my_client = {
        .name   = "myRDMAclient",
        .add    = myClient_add_one,
        .remove = myClient_remove_one
};


/*
 * Module entry point: register my_client with the IB core.
 *
 * Returns 0 on success, or the error code from ib_register_client().
 * A failure is logged at KERN_ERR rather than passing silently, and the
 * success message is informational, so it is logged at KERN_INFO instead
 * of KERN_ALERT.
 */
static int __init myRDMAclient_init(void)
{
    int ret;

    ret = ib_register_client(&my_client);
    if (ret) {
        printk(KERN_ERR "Failed to register IB client: %d\n", ret);
        return ret;
    }

    printk(KERN_INFO "Successfully registered myRDMAclient module \n");
    return 0;
}


module_init(myRDMAclient_init);

Here all the calls work except ib_create_qp(mypd,&init_qpattr);, which fails to create the queue pair.

Update: I now register memory before creating the queue pair, but ib_create_qp still fails with an invalid argument error (error code -22):

#include <linux/kernel.h>
#include <linux/init.h>
#include <linux/module.h>
#include <linux/list.h>
#include <linux/module.h>
#include <linux/err.h>
#include "myClient.h"


/* Not referenced anywhere else in the visible code. */
struct workqueue_struct *myClient_workqueue;
/* Not referenced anywhere else in the visible code. */
struct ib_sa_client myClient_sa_client;
/*
 * Forward declarations, currently disabled; both functions are defined
 * further down, before the ib_client table that refers to them.
 *
static void myClient_add_one(struct ib_device *device);
static void myClient_remove_one(struct ib_device *device);
*/

/*
 * Verbs objects created by myClient_add_one(). They are file-scope
 * singletons, so each call to myClient_add_one() overwrites them.
 */
struct ib_pd *mypd;             /* protection domain from ib_alloc_pd() */
struct ib_cq *myrcvcq;          /* receive completion queue */
struct ib_cq *myClientsendcq;   /* send completion queue */
struct ib_qp *myClientqp;       /* queue pair from ib_create_qp() */
struct ib_mr *mymr;             /* DMA memory region from ib_get_dma_mr() */

/*
 * Completion callback handed to ib_create_cq() for the receive CQ.
 * It only logs a fixed message; @cq is not polled or re-armed here.
 *
 * NOTE(review): the verbs completion-handler type presumably also takes a
 * "void *cq_context" argument - confirm against ib_verbs.h for the headers
 * this module is built with.
 */
void myClient_ib_recvcompletion(struct ib_cq *cq)
{
    printk("A user-specified callback that is invoked when a completion event occurs on the CQ.\n");
}


/*
 * Completion callback handed to ib_create_cq() for the send CQ.
 * It only logs a fixed message (the same text as the receive callback);
 * @cq is not polled or re-armed here.
 *
 * NOTE(review): the verbs completion-handler type presumably also takes a
 * "void *cq_context" argument - confirm against ib_verbs.h for the headers
 * this module is built with.
 */
void myClient_ib_sendcompletion(struct ib_cq *cq)
{
        printk("A user-specified callback that is invoked when a completion event occurs on the CQ.\n");
}
/*
 * Asynchronous-event callback with the (struct ib_event *, void *) shape
 * used for queue-pair event handlers. Both arguments are ignored; the
 * function only logs a fixed message.
 */
static void my_qp_event_handler(struct ib_event *myqpAsyncEvent, void *anyPointer)
{
        printk(KERN_INFO "Dummy affiliated asynchronous event occured function called \n");
}


static void myClient_add_one(struct ib_device *device)
{
    union ib_gid tmp_gid;
    int ret;
    int hcaport = 1;
    int result = -ENOMEM;
    u16 port1Pkey;
    struct ib_port_attr attr;

        ret = ib_query_port(device,hcaport,&attr);
        printk("ib query port result %d  \n", ret);

//  Creating the Protection Domain for RDMA
    mypd = ib_alloc_pd(device);

    if(IS_ERR(mypd)){
        printk(KERN_INFO "Failed to allocate PD\n");
        return;
    }
    else{
        printk(KERN_INFO "1Successfully allocated the PD\n");
        pdset = true;
    }
// Registering Memory
    mymr = ib_get_dma_mr(mypd,IB_ACCESS_LOCAL_WRITE | IB_ACCESS_REMOTE_READ| IB_ACCESS_REMOTE_WRITE);
    if(IS_ERR(mymr)){
            printk("failed to register memory :( %d \n",PTR_ERR(mymr));
    }else{
            printk(KERN_INFO "Successfully registered memory region :) \n");
    }
// End Registering Memory
//  Creating the receive completion queue for RDMA
    myrcvcq = ib_create_cq(device,myClient_ib_recvcompletion,NULL,NULL,myClient_recvq_size,0);
        if(IS_ERR(myrcvcq)){
                pr_err("%s:%d error code for receive cq%d\n", __func__, __LINE__, PTR_ERR(myrcvcq));
                //printk("Error creating QP: %d \n",PTR_ERR(myClientqp));
        }
    else{
        printk("Recieve CQ successfully created in address: %x \n",myrcvcq);
    }

//  Creating the send completion queue for RDMA
    myClientsendcq = ib_create_cq(device,myClient_ib_sendcompletion, NULL, NULL,myClient_sendq_size,0 );
        if(IS_ERR(myClientsendcq)){
                pr_err("%s:%d scqerror code for send cq%d\n", __func__, __LINE__, PTR_ERR(myClientsendcq));
                //printk("Error creating QP: %d \n",PTR_ERR(myClientqp));
        }
        else{
                printk("1Send CQ successfully created in address: %x \n",myClientsendcq);
        }

//  Creating the queue pair
//      Creating the queue pair

        struct ib_qp_init_attr init_qpattr;

        memset(&init_qpattr,0,sizeof(init_qpattr));
        init_qpattr.event_handler = myClient_qp_event_handler;
        init_qpattr.cap.max_send_wr = 2;
        init_qpattr.cap.max_recv_wr = 2;
        init_qpattr.cap.max_recv_sge = 1;
        init_qpattr.cap.max_send_sge = 1;
        init_qpattr.sq_sig_type = IB_SIGNAL_ALL_WR;
        init_qpattr.qp_type = IB_QPT_UD;
        init_qpattr.send_cq = myClientsendcq;
        init_qpattr.recv_cq = myrcvcq;

        myClientqp = ib_create_qp(mypd,&init_qpattr);

        if(IS_ERR(myClientqp)){
                pr_err("%s:%d error code %d\n", __func__, __LINE__, PTR_ERR(myClientqp));
                //printk("Error creating QP: %d \n",PTR_ERR(myClientqp));
        }
        else{
                printk(KERN_INFO "1The queue pair is successfully created \n");
                qpcreated = true;
        }



}
/*
 * ib_client .remove callback for @device. Currently a no-op.
 *
 * NOTE(review): none of the objects created in myClient_add_one() (PD, MR,
 * CQs, QP) are released here - this looks like a leak when the device goes
 * away; confirm and add teardown.
 */
static void myClient_remove_one(struct ib_device *device)
{
}

/*
 * Client descriptor passed to ib_register_client() from the module init
 * function; it wires the add/remove callbacks defined above.
 */
static struct ib_client my_client = {
        .name   = "myRDMAclient",
        .add    = myClient_add_one,
        .remove = myClient_remove_one
};


/*
 * Module entry point: register my_client with the IB core.
 *
 * Returns 0 on success, or the error code from ib_register_client().
 * A failure is logged at KERN_ERR rather than passing silently, and the
 * success message is informational, so it is logged at KERN_INFO instead
 * of KERN_ALERT.
 */
static int __init myRDMAclient_init(void)
{
    int ret;

    ret = ib_register_client(&my_client);
    if (ret) {
        printk(KERN_ERR "Failed to register IB client: %d\n", ret);
        return ret;
    }

    printk(KERN_INFO "Successfully registered myRDMAclient module \n");
    return 0;
}


module_init(myRDMAclient_init);
Tarsuss answered 14/1, 2016 at 11:43 Comment(10)
What is the error message printed when IS_ERR(myClientqp) is true?Spirited
It says myClient_add_one :<line number> error code -22Tarsuss
OK. so what does error code 22 mean?Spirited
Is there a file in linux kernel directory where I can find the meaning of this codes ? I want it because different webpages are mentioning different meanings for itTarsuss
22 is EINVAL. It's saying one of the parameters you passed in is invalid. I don't see anything obviously wrong with your code; what kernel version and low-level IB driver (mlx4, mthca, etc) are you using?Telly
kernel version - 3.0.76-0.11 , IB driver mlx4Tarsuss
one more question... what architecture? 32-bit x86, 64-bit x86, something else?Telly
Do you have IB working on the system? For example does ib_ipoib module work and create an ib0 interface you can send and receive traffic on?Telly
Yes Roland. I already have the ib_ipoib module installed. I can see the ib0 and ib1 interfaces; ib0 and ib1 are bonded into the bond0 interface, and I can ping other ib_ipoib systems using these interfaces.Tarsuss
@Tarsuss If you have been able to fix this please let us know how.Ascariasis
T
3

UPDATE:

Based on the discussion in the comments below, I'm guessing you installed Mellanox OFED drivers on top of your current distribution. Looking at the 3.1-1.0.3 source of Mellanox OFED kernel drivers, I see that they changed the layout of struct ib_qp_init_attr by adding some fields. I'm pretty sure that your problem is that you're building your module against the original SLE 3.0.76-0.11 kernel headers, so the init_qpattr structure you're passing to the create QP function does not have the values you set up in the right places.

I don't know how you've installed the new out-of-tree drivers, so I can't tell you exactly how to build your module properly, but you could try adding something like

    init_qpattr.qpg_type = 0;

to where you set up the struct. (I know you memset the whole thing to zero already, but this will make sure that the headers you're building against have the new qpg_type member for the structure. I think that's a new field added by OFED that isn't in your original kernel headers, so if your module compiles, then you're building against the right headers)

OLD ANSWER:

So I suspect that you are running into a bug in the mlx4 driver related to creating such a small QP (max_send_wr == max_recv_wr == 2 and max_send_sge == max_recv_sge == 1). I managed to find the source for the 3.0.76-0.11 kernel you're using, and I don't see any obvious bug, unfortunately.

Some things you could try to help debug this

  1. Add a module parameter debug_level=1 to the mlx4_core module when loading it. Update your question with all the output from driver initialization (a bunch of lines about "Max CQEs:" etc.). There is a fair amount of logic in the mlx4 driver that depends on the parameters returned by firmware during initialization, and this output would let us see what those are.
  2. For that matter, it's worth checking if your HCA firmware is up-to-date — you may get better results with newer firmware (although the driver should work anyway, you might be hitting a bug in untested driver code because of a missing firmware feature that triggers a different code path).
  3. Try updating your code to increase those parameters. You could try increasing max_send_sge and max_recv_sge to 2 and increasing max_send_wr and max_recv_wr to, say, 32 or 128. (Try increasing those individually or in combination.)
  4. If you know how to enable the function tracer (This LWN article is helpful; I'm assuming the old SLES kernel has all the required features), then enabling tracing for the mlx4_ib and mlx4_core modules and then loading your module would be great. If you update your question with the trace, then we can look at where the create QP operation is failing — for example, is it failing in set_rq_size(), getting to set_kernel_sq_size() or failing somewhere else?
Telly answered 21/1, 2016 at 22:10 Comment(14)
Did you run this code on your 3.0.76-0.11 kernel ? Does it successfully created the Queue Pair ? I think I didn't have the debug option enabled in my kernel so could not used your suggestion num. 1 and 4. I did checked with suggestion number 3 but this also showed the same "Invalid Argument" error code -22. Suggestion 2, I'm working on it would need some time to confirm.Tarsuss
I didn't go so far as to install SLES11, so I didn't actually try your code. I'm really mystified that IPoIB is able to run successfully on your system (and create a QP) while your code fails. The IPoIB driver does set a few flags on QP creation that you don't, but I don't see how that could make a difference. Are you sure that you're building your module against the right kernel headers?Telly
I looked at the SLES kernel config and AFAICT CONFIG_MLX4_DEBUG is set to 'y", so you can at least do #1. Looks like you are correct that unfortunately the function graph tracer is not enabled. If you're able to rebuild your kernel, enabling that and/or just adding printks to the mlx4 driver would let you debug this much more easily.Telly
I have used kprobe to trace. I found that "mlx4_ib_create_qp" function is getting executed everytime I load my module. But inside that function I have another function "check_qpg_attr". But this "check_qpg_attr" is never executed. (I was just counting the number of calling using kprobe). Important thing to note is before this "check_qpg_attr" function call there are 4 "if" block all of which returns -EINVAL (Invalid argument error). Let me know if this is helpful info to youTarsuss
The four "if" blocks are - if (mlx4_qp_flags & ~(MLX4_IB_QP_LSO | MLX4_IB_QP_CAP_CROSS_CHANNEL | MLX4_IB_QP_CAP_MANAGED_SEND | MLX4_IB_QP_CAP_MANAGED_RECV | MLX4_IB_QP_BLOCK_MULTICAST_LOOPBACK | MLX4_IB_SRIOV_TUNNEL_QP | MLX4_IB_SRIOV_SQP | MLX4_IB_QP_NETIF)) return ERR_PTR(-EINVAL);Tarsuss
if (init_attr->create_flags & IB_QP_CREATE_NETIF_QP) { if (init_attr->qp_type != IB_QPT_UD) return ERR_PTR(-EINVAL); }Tarsuss
if ((mlx4_qp_flags & (MLX4_IB_QP_CAP_CROSS_CHANNEL | MLX4_IB_QP_CAP_MANAGED_SEND | MLX4_IB_QP_CAP_MANAGED_RECV)) && !(to_mdev(device)->dev->caps.flags & MLX4_DEV_CAP_FLAG_CROSS_CHANNEL)) { pr_debug("%s Does not support cross-channel operations\n", to_mdev(device)->ib_dev.name); return ERR_PTR(-EINVAL); }Tarsuss
if ((init_attr->create_flags & ~(IB_QP_CREATE_CROSS_CHANNEL | IB_QP_CREATE_MANAGED_SEND | IB_QP_CREATE_MANAGED_RECV)) && (((mlx4_qp_flags & ~MLX4_IB_SRIOV_SQP) && init_attr->qp_type != IB_QPT_UD) || ((mlx4_qp_flags & MLX4_IB_SRIOV_SQP) && init_attr->qp_type > IB_QPT_GSI))) return ERR_PTR(-EINVAL);Tarsuss
But I don't know the values of this enum symbols. If you know then please let me know so that I can try with them and check again.Tarsuss
Are you running a straight SLES11 SP3 install, or did you install OFED or Mellanox OFED drivers? Can you let me know how I can get the same driver source you're referring to? I'm looking at kernel.opensuse.org/cgit/kernel/plain/drivers/infiniband/hw/… and it doesn't match what you're looking at.Telly
Let us continue this discussion in chat.Telly
Yes, I installed the Mellanox OFED drivers. And my module does not compile if I add init_qpattr.qpg_type = 0; in that case the compiler reports that ib_qp_init_attr does not have any member named qpg_type. Please note that earlier I had an "unknown symbol version" problem for all the exported IB symbols when inserting the module, so I copied the Module.symvers file from /usr/src/ofa_kernel/default/Module.symvers.Tarsuss
Then I could successfully compiled my module and inserted unless this ib_create_qp which is causing errors. Please let me know if this is info was useful to youTarsuss
Were you able to write the kernel module successfully?Table
S
-1

I think you forgot to register memory region. The actions you need to do before creating QP are:

  1. Creating the protection domain
  2. Register memory region
  3. Creating completion queues

and only then creating QP.

I don't know what device and lib you are using, but in Mellanox IB lib it's:

char mr_buffer[REGION_SIZE];
//mypd its your protection domain that you allocated 
struct ibv_mr *mr = ibv_reg_mr(mypd , mr_buffer, REGION_SIZE, 0);
if (!mr) {
    //ERROR MSG
}
Sacrum answered 19/1, 2016 at 9:7 Comment(3)
What is the typical REGION_SIZE value you use ?Tarsuss
This is absolutely wrong. There is no requirement to register an MR before creating a QP.Telly
@Dor marcus: I have made changes, since ib_reg_mr is not there in kernel ib_verbs.h and ib_reg_phys is not implemented, I used ib_get_dma to register. It registered memory successfully. However, I am still getting the error -22 (which is for invalid argument) for ib_create_qp. Please help. ThanksTarsuss

© 2022 - 2024 — McMap. All rights reserved.