一. DPDK源码版本: DPDK19.02
二. DPDK 初始化部分
1.初始化EAL环境:rte_eal_init();
2.解析参数:DPDK自身的参数已经在EAL初始化时完成解析,所以这里主要解析的是应用程序自己的参数,可以使用getopt_long函数。
3.初始化内存池等,这里要注意放在接口的初始化之前,为接收数据包做准备。
4.初始化接口
5.启动所有核上的线程。rte_eal_mp_remote_launch()
三. 下面详细讲解初始化过程:
2.1 EAL初始化
1)EAL功能作用:
• Intel® DPDK loading and launching
• Support for multi-process and multi-thread execution types
• Core affinity/assignment procedures
• System memory allocation/de-allocation
• Atomic/lock operations
• Time reference
• PCI bus access
• Trace and debug functions
• CPU feature identification
• Interrupt handling
• Alarm operations
ref:(详细可参考文章)
2) 初始化程序: 源文件eal.c
/*
 * Launch threads, called at application init().
 *
 * Initializes the EAL: parses the EAL command-line options, sets up the
 * shared configuration, interrupt thread, multi-process channel, buses,
 * hugepages, logging, memory subsystems and timers, then creates one thread
 * per slave lcore and probes the buses.
 *
 * argc, argv: the application's command line; argv[0] is used to derive the
 *             log identifier.
 *
 * Returns the value returned by eal_parse_args() on success, or -1 on
 * failure. A static atomic flag (run_once) rejects a second call with
 * rte_errno = EALREADY.
 *
 * NOTE(review): error handling is not uniform in this listing. Several
 * failure paths (rte_eal_intr_init, rte_mp_dev_hotplug_init,
 * vfio_mp_sync_setup, eal_clean_runtime_dir) return -1 without assigning
 * rte_errno here, and only some failure paths clear run_once. Failure to
 * create the per-lcore pipes or threads calls rte_panic() instead of
 * returning.
 *
 * The explanatory comments on individual helper calls below are translated
 * from the original annotations; the helpers' internals are not visible in
 * this listing, so treat those descriptions as the annotator's notes.
 */
int
rte_eal_init(int argc, char **argv)
{
	int i, fctret, ret;
	pthread_t thread_id;
	static rte_atomic32_t run_once = RTE_ATOMIC32_INIT(0);
	const char *p;
	static char logid[PATH_MAX];
	char cpuset[RTE_CPU_AFFINITY_STR_LEN];
	char thread_name[RTE_MAX_THREAD_NAME_LEN];

	/* checks if the machine is adequate */
	// Check whether the CPU's feature flags are supported.
	// DPDK may rely on advanced CPU instructions to speed up computation
	// at run time.
	if (!rte_cpu_is_supported()) {
		rte_eal_init_alert("unsupported cpu type.");
		rte_errno = ENOTSUP;
		return -1;
	}

	// The static local variable run_once ensures that this function is
	// only executed once.
	if (!rte_atomic32_test_and_set(&run_once)) {
		rte_eal_init_alert("already called initialization.");
		rte_errno = EALREADY;
		return -1;
	}

	/* Use the basename of argv[0] as the log identifier. */
	p = strrchr(argv[0], '/');
	strlcpy(logid, p ? p + 1 : argv[0], sizeof(logid));
	thread_id = pthread_self();

	// Initialize struct internal_config.
	eal_reset_internal_config(&internal_config);

	/* set log level as early as possible */
	// Parse the command line, handling only "--log-level"; the result is
	// stored in internal_config.log_level.
	eal_log_level_parse(argc, argv);

	// Get the number of CPUs in the system.
	if (rte_eal_cpu_init() < 0) {
		rte_eal_init_alert("Cannot detect lcores.");
		rte_errno = ENOTSUP;
		return -1;
	}

	/*
	 * EAL initialization options (annotator's summary).
	 * NOTE(review): this list looks like it was written for an older DPDK
	 * release; confirm each option, and the "-c is mandatory" statement,
	 * against the 19.02 option list before relying on it.
	 *   -c COREMASK: hexadecimal mask of the CPU cores to use. Core
	 *       numbering differs between platforms and must be determined
	 *       beforehand.
	 *   -n NUM: number of memory channels per processor socket
	 *   -b domain:bus:devid.func: port blacklist, PCI devices the EAL must
	 *       not use (several -b options may be given)
	 *   --socket-mem: allocate hugepage memory on the given sockets
	 *   -m MB: amount of hugepage memory to allocate, regardless of
	 *       processor socket. Using --socket-mem instead of this option is
	 *       recommended.
	 *   -r NUM: number of memory ranks
	 *   -v: display the program version
	 *   --huge-dir: mount point of the hugepage filesystem
	 *   --file-prefix: prefix for hugepage file names
	 *   --proc-type: process type (primary, secondary, auto)
	 *   --xen-dom0: support running in Xen Domain0 without hugepage memory
	 *   --vmware-tsc-map: use the VMware TSC instead of the native RDTSC
	 *   --base-virtaddr: specify the base virtual address
	 *   --vfio-intr: interrupt type used by VFIO (no effect if VFIO is not
	 *       used)
	 *   -c is mandatory, all others are optional.
	 */
	fctret = eal_parse_args(argc, argv);
	if (fctret < 0) {
		rte_eal_init_alert("Invalid 'command line' arguments.");
		rte_errno = EINVAL;
		rte_atomic32_clear(&run_once);
		return -1;
	}

	// Original annotation: "initialize internal_config from the
	// command-line arguments".
	// NOTE(review): that sentence appears to describe eal_parse_args()
	// above rather than eal_plugins_init() - confirm.
	if (eal_plugins_init() < 0) {
		rte_eal_init_alert("Cannot init plugins");
		rte_errno = EINVAL;
		rte_atomic32_clear(&run_once);
		return -1;
	}

	if (eal_option_device_parse()) {
		rte_errno = ENODEV;
		rte_atomic32_clear(&run_once);
		return -1;
	}

	/*
	 * Annotator's call-flow notes for rte_config_init():
	 *
	 * Primary process (RTE_PROC_PRIMARY):
	 *   rte_eal_config_create
	 *     eal_runtime_config_path: get the runtime configuration file
	 *       path, e.g. "/var/run/.rte_config"
	 *     open the file, lock it, and mmap it into memory
	 *     copy the early configuration structure (global variable
	 *       early_mem_config) into that memory; rte_config.mem_config
	 *       points to it
	 *     the mapped address is saved in
	 *       rte_config.mem_config->mem_cfg_addr so that secondary
	 *       processes can later map the file at the same address
	 *
	 * Secondary process (RTE_PROC_SECONDARY):
	 *   rte_eal_config_attach
	 *     eal_runtime_config_path
	 *     open the file and mmap it into memory
	 *     rte_config.mem_config points to the mapped memory
	 *   rte_eal_mcfg_wait_complete
	 *     keep waiting while the magic member of struct rte_mem_config has
	 *       not been set to RTE_MAGIC
	 *     (the primary writes RTE_MAGIC into that member once it is ready)
	 *   rte_eal_config_reattach
	 *     read the primary's mapping address from the file mapped above
	 *       (i.e. rte_config.mem_config->mem_cfg_addr)
	 *     munmap the previous mapping
	 *     mmap again at the primary's address; if the resulting address
	 *       differs from the requested one, exit with an error
	 *     point rte_config.mem_config at the remapped memory
	 */
	rte_config_init();


	/*
	 * Annotator's call-flow notes for rte_eal_intr_init():
	 *   initialize the global interrupt source head
	 *   create a pipe
	 *   create a thread that waits for and handles interrupts; its entry
	 *   function is eal_intr_thread_main
	 *     thread main loop
	 *       epoll_create: create the epoll file descriptor
	 *       epoll_ctl: add the read end of the pipe created above to the
	 *         epoll wait list
	 *       walk the list of struct rte_intr_source headed by the global
	 *       interrupt source head
	 *         skip entries that have no callback attached
	 *         add all uio device file descriptors to the epoll wait list
	 *       eal_intr_handle_interrupts
	 *         epoll_wait: wait for an I/O event on an epoll file descriptor
	 *         eal_intr_process_interrupts
	 *           iterate over all I/O events that occurred
	 *             if the read end of the pipe is readable, read from it
	 *               and return
	 *             walk the struct rte_intr_source list to find the entry
	 *               matching the current I/O event
	 *             choose the number of bytes to read from the interrupt
	 *               handle type (uio/alarm/...)
	 *             read from the file descriptor
	 *             if the read succeeds, run every callback attached to
	 *               the current struct rte_intr_source
	 *         when eal_intr_process_interrupts returns a negative value,
	 *           this round of interrupt handling ends and returns
	 *       close the epoll file descriptor
	 *   if the thread was created successfully, call rte_thread_setname to
	 *   name it "eal-intr-thread"
	 *     pthread_setname_np
	 *
	 * Annotator's notes on the per-lcore threads (these describe the
	 * RTE_LCORE_FOREACH_SLAVE loop later in this function and
	 * eal_thread_loop, not rte_eal_intr_init):
	 *   loop (browse all running lcores except the master lcore)
	 *     create the pipes used for master <-> slave thread communication
	 *     set the slave thread state to WAIT
	 *     create the slave thread; its entry function is eal_thread_loop
	 *       get the current thread's lcore_id from its thread ID
	 *       get the file descriptor the slave reads from on the
	 *         master-to-slave pipe (m2s)
	 *       get the file descriptor the slave writes to on the
	 *         slave-to-master pipe (s2m)
	 *       eal_thread_set_affinity: set the slave thread's cpu affinity
	 *       eal_thread_dump_affinity
	 *       thread main loop
	 *         wait for a command from the master thread
	 *         set the thread state to RUNNING
	 *         send an ack to the master thread
	 *         read the lcore_function_t function pointer and its argument
	 *           from the struct lcore_config of the current lcore
	 *         run that function and store its return value
	 *         set the thread state to FINISHED
	 *     if the thread was created successfully, call rte_thread_setname
	 *     to name it "lcore-slave-xx"
	 */
	/* NOTE(review): this failure path does not assign rte_errno here. */
	if (rte_eal_intr_init() < 0) {
		rte_eal_init_alert("Cannot init interrupt-handling thread");
		return -1;
	}

	/* Put mp channel init before bus scan so that we can init the vdev
	 * bus through mp channel in the secondary process before the bus scan.
	 */
	/*
	 * The multi-process case is somewhat more involved: besides
	 * communication between threads, the primary process also has to
	 * communicate with the secondary processes. The function below
	 * ("mp" stands for multiple process) sets this up; according to the
	 * annotator it creates a dedicated thread that receives messages from
	 * other processes.
	 */
	/* A failure here is only fatal for the primary process. */
	if (rte_mp_channel_init() < 0) {
		rte_eal_init_alert("failed to init mp channel");
		if (rte_eal_process_type() == RTE_PROC_PRIMARY) {
			rte_errno = EFAULT;
			return -1;
		}
	}

	/* register multi-process action callbacks for hotplug */
	// Registers an action.
	/* NOTE(review): this failure path does not assign rte_errno here. */
	if (rte_mp_dev_hotplug_init() < 0) {
		rte_eal_init_alert("failed to register mp callback for hotplug");
		return -1;
	}

	/*
	 * Main entry point for bus scanning (annotator's notes): internally it
	 * calls every bus->scan in order to enumerate the devices registered
	 * under each bus.
	 * The default device path for the PCI bus is /sys/bus/pci/devices.
	 * Unlike the kernel's own enumeration, DPDK only reads the sysfs
	 * information the kernel created when it scanned PCI, i.e. it picks up
	 * the PCI information the kernel has already discovered.
	 * In the Linux device model each bus type has device and driver
	 * directories; every entry in them points to the actual device file:
	 *   /sys/bus/pci/devices/
	 */
	if (rte_bus_scan()) {
		rte_eal_init_alert("Cannot scan the buses for devices");
		rte_errno = ENODEV;
		rte_atomic32_clear(&run_once);
		return -1;
	}

	/* if no EAL option "--iova-mode=", use bus IOVA scheme */
	if (internal_config.iova_mode == RTE_IOVA_DC) {
		/* autodetect the IOVA mapping mode (default is RTE_IOVA_PA) */
		// rte_eal_get_configuration() returns the global configuration
		// structure struct rte_config (annotator: it initially points
		// at the global variable early_mem_config).
		rte_eal_get_configuration()->iova_mode =
			rte_bus_get_iommu_class();

		/* Workaround for KNI which requires physical address to work */
		if (rte_eal_get_configuration()->iova_mode == RTE_IOVA_VA &&
				rte_eal_check_module("rte_kni") == 1) {
			rte_eal_get_configuration()->iova_mode = RTE_IOVA_PA;
			RTE_LOG(WARNING, EAL,
				"Some devices want IOVA as VA but PA will be used because.. "
				"KNI module inserted\n");
		}
	} else {
		rte_eal_get_configuration()->iova_mode =
			internal_config.iova_mode;
	}

	/* The primary gathers hugepage info; other process types read it. */
	if (internal_config.no_hugetlbfs == 0) {
		/* rte_config isn't initialized yet */
		ret = internal_config.process_type == RTE_PROC_PRIMARY ?
				eal_hugepage_info_init() :
				eal_hugepage_info_read();
		if (ret < 0) {
			rte_eal_init_alert("Cannot get hugepage information.");
			rte_errno = EACCES;
			rte_atomic32_clear(&run_once);
			return -1;
		}
	}

	/* Without hugetlbfs and no explicit size, fall back to a default. */
	if (internal_config.memory == 0 && internal_config.force_sockets == 0) {
		if (internal_config.no_hugetlbfs)
			internal_config.memory = MEMSIZE_IF_NO_HUGE_PAGE;
	}

	if (internal_config.vmware_tsc_map == 1) {
#ifdef RTE_LIBRTE_EAL_VMWARE_TSC_MAP_SUPPORT
		rte_cycles_vmware_tsc_map = 1;
		RTE_LOG (DEBUG, EAL, "Using VMWARE TSC MAP, "
				"you must have monitor_control.pseudo_perfctr = TRUE\n");
#else
		RTE_LOG (WARNING, EAL, "Ignoring --vmware-tsc-map because "
				"RTE_LIBRTE_EAL_VMWARE_TSC_MAP_SUPPORT is not set\n");
#endif
	}

	/* Seed the random generator from the timestamp counter. */
	rte_srand(rte_rdtsc());

	/*
	 * Annotator's call-flow notes for rte_eal_log_init().
	 * NOTE(review): the mempool-related steps below look like they
	 * describe an older DPDK release - confirm against 19.02.
	 *   call fopencookie to define a custom log-writing interface
	 *   call openlog to open the log
	 *   rte_eal_common_log_init:
	 *     STAILQ_INIT: initialize the singly-linked tail queue whose head
	 *       is log_history
	 *     rte_mempool_create
	 *     if creating the mempool fails, call rte_mempool_lookup
	 *       get struct rte_mempool_list, the head of the list linking all
	 *         mempool structures
	 *       iterate over every node of that list
	 *         compare the name of the struct rte_mempool pointed to by
	 *           the data field of the struct rte_tailq_entry with the
	 *           requested name
	 *       return the pointer to the struct rte_mempool found, or NULL
	 */
	if (rte_eal_log_init(logid, internal_config.syslog_facility) < 0) {
		rte_eal_init_alert("Cannot init logging.");
		rte_errno = ENOMEM;
		rte_atomic32_clear(&run_once);
		return -1;
	}

#ifdef VFIO_PRESENT
	if (rte_eal_vfio_setup() < 0) {
		rte_eal_init_alert("Cannot init VFIO");
		rte_errno = EAGAIN;
		rte_atomic32_clear(&run_once);
		return -1;
	}
#endif
	/* in secondary processes, memory init may allocate additional fbarrays
	 * not present in primary processes, so to avoid any potential issues,
	 * initialize memzones first.
	 */
	/*
	 * Annotator's notes on memzones:
	 * Within DPDK memory management, rte_memzone acts as the bookkeeper
	 * for other resources. By default RTE_MAX_MEMZONE rte_memzone entries
	 * are created during DPDK initialization; each one can record the
	 * memory location of an rte_ring or an rte_mempool, and each rte_ring
	 * or rte_mempool holds a pointer back to its associated rte_memzone.
	 * A memzone is the basic unit of memory allocation: mempool and
	 * malloc_heap both perform rte_memzone_reserve when they need memory.
	 * rte_memzone_reserve carves a block of memory out of a memseg.
	 */
	/* NOTE(review): from here on, failure paths no longer clear run_once. */
	if (rte_eal_memzone_init() < 0) {
		rte_eal_init_alert("Cannot init memzone");
		rte_errno = ENODEV;
		return -1;
	}

	/*
	 * Annotator's notes for rte_eal_memory_init():
	 * 1. get the physical addresses of all reserved hugepages and sort
	 *    them by physical address
	 * 2. group the hugepages into memsegs based on physical address,
	 *    virtual address, socket_id, etc.
	 * 3. share the information about all memsegs among all DPDK processes
	 */
	if (rte_eal_memory_init() < 0) {
		rte_eal_init_alert("Cannot init memory");
		rte_errno = ENOMEM;
		return -1;
	}

	/* the directories are locked during eal_hugepage_info_init */
	// Unlock the hugepage directories (locked earlier by
	// eal_hugepage_info_init).
	eal_hugedirs_unlock();

	/*
	 * Annotator's notes for rte_eal_malloc_heap_init():
	 * 1. manages contiguous memsegs as heaps (the heap data abstraction)
	 * 2. registers register_mp_requests
	 * 3. rte_memseg_contig_walk walks the contiguous memsegs in the memseg
	 *    lists, and malloc_add_seg adds that memory to heap management
	 * 4. heap management itself is implemented in malloc_heap_add_memory
	 */
	if (rte_eal_malloc_heap_init() < 0) {
		rte_eal_init_alert("Cannot init malloc heap");
		rte_errno = ENODEV;
		return -1;
	}

	if (rte_eal_tailqs_init() < 0) {
		rte_eal_init_alert("Cannot init tail queues for objects");
		rte_errno = EFAULT;
		return -1;
	}

	// Fills in the global struct rte_intr_handle and calls timerfd_create
	// to create the timer object (annotator's note).
	// NOTE(review): the alert text below is the same string used for
	// rte_eal_intr_init() above, although this is the alarm init step.
	if (rte_eal_alarm_init() < 0) {
		rte_eal_init_alert("Cannot init interrupt-handling thread");
		/* rte_eal_alarm_init sets rte_errno on failure. */
		return -1;
	}

	/*
	 * Annotator's notes for rte_eal_timer_init():
	 *   set the global variable eal_timer_source to EAL_TIMER_TSC
	 *     (TSC/HPET)
	 *   set_tsc_freq: set the TSC frequency (the annotation describes it
	 *     as "clock interrupts per second"; presumably TSC ticks per
	 *     second is meant - confirm)
	 *   parse "/proc/cpuinfo" and check whether "constant_tsc" and
	 *     "nonstop_tsc" are present in the "flags" attribute
	 */
	if (rte_eal_timer_init() < 0) {
		rte_eal_init_alert("Cannot init HPET or TSC timers");
		rte_errno = ENOTSUP;
		return -1;
	}

	/*
	 * Annotator's notes for eal_check_mem_on_local_socket():
	 *   get the NUMA socket of the master lcore
	 *   rte_eal_get_physmem_layout: get the address of the struct
	 *     rte_memseg array
	 *   walk the struct rte_memseg array and check whether a matching
	 *     entry exists (on this NUMA socket and with length greater
	 *     than 0)
	 */
	eal_check_mem_on_local_socket();

	/*
	 * Annotator's notes for eal_thread_init_master():
	 *   set the lcore_id of the master thread
	 *   eal_thread_set_affinity
	 *     rte_sys_gettid: get the thread's tid
	 *     set the thread's CPU affinity and record NUMA socket and
	 *       related information
	 */
	eal_thread_init_master(rte_config.master_lcore);

	// Dump the CPU affinity of the current thread.
	ret = eal_thread_dump_affinity(cpuset, sizeof(cpuset));

	RTE_LOG(DEBUG, EAL, "Master lcore %u is ready (tid=%zx;cpuset=[%s%s])\n",
		rte_config.master_lcore, (uintptr_t)thread_id, cpuset,
		ret == 0 ? "" : "...");

	RTE_LCORE_FOREACH_SLAVE(i) {

		/*
		 * create communication pipes between master thread
		 * and children
		 */
		if (pipe(lcore_config[i].pipe_master2slave) < 0)
			rte_panic("Cannot create pipe\n");
		if (pipe(lcore_config[i].pipe_slave2master) < 0)
			rte_panic("Cannot create pipe\n");

		lcore_config[i].state = WAIT;

		/* create a thread for each lcore */
		ret = pthread_create(&lcore_config[i].thread_id, NULL,
				     eal_thread_loop, NULL);
		if (ret != 0)
			rte_panic("Cannot create thread\n");

		/* Set thread_name for aid in debugging. */
		snprintf(thread_name, sizeof(thread_name),
			"lcore-slave-%d", i);
		ret = rte_thread_setname(lcore_config[i].thread_id,
						thread_name);
		if (ret != 0)
			RTE_LOG(DEBUG, EAL,
				"Cannot set name for lcore thread\n");
	}

	/*
	 * Launch a dummy function on all slave lcores, so that master lcore
	 * knows they are all ready when this function returns.
	 */
	/* Tell every slave thread to run a dummy function. */
	/*
	 * Annotator's call-flow notes for rte_eal_mp_remote_launch():
	 *   check that every slave thread/lcore is in the WAIT state
	 *   rte_eal_remote_launch: send the execute command to each slave
	 *   thread/lcore
	 *     get the file descriptor the master writes to on the
	 *       master-to-slave pipe (m2s)
	 *     get the file descriptor the master reads from on the
	 *       slave-to-master pipe (s2m)
	 *     store the lcore_function_t function pointer and its argument in
	 *       the struct lcore_config of that lcore
	 *     send the command to the slave thread
	 *     wait for the ack sent by the slave thread
	 *   if the last argument is CALL_MASTER (lcore handler executed by
	 *   master core), the master thread runs the function as well
	 */
	rte_eal_mp_remote_launch(sync_func, NULL, SKIP_MASTER);
	rte_eal_mp_wait_lcore();

	/* initialize services so vdevs register service during bus_probe. */
	ret = rte_service_init();
	if (ret) {
		rte_eal_init_alert("rte_service_init() failed");
		rte_errno = ENOEXEC;
		return -1;
	}

	/* Probe all the buses and devices/drivers on them */
	if (rte_bus_probe()) {
		rte_eal_init_alert("Cannot probe devices");
		rte_errno = ENOTSUP;
		return -1;
	}

#ifdef VFIO_PRESENT
	/* Register mp action after probe() so that we got enough info */
	/* NOTE(review): this failure path neither alerts nor assigns rte_errno here. */
	if (rte_vfio_is_enabled("vfio") && vfio_mp_sync_setup() < 0)
		return -1;
#endif

	/* initialize default service/lcore mappings and start running. Ignore
	 * -ENOTSUP, as it indicates no service coremask passed to EAL.
	 */
	ret = rte_service_start_with_defaults();
	if (ret < 0 && ret != -ENOTSUP) {
		rte_errno = ENOEXEC;
		return -1;
	}

	/*
	 * Clean up unused files in runtime directory. We do this at the end of
	 * init and not at the beginning because we want to clean stuff up
	 * whether we are primary or secondary process, but we cannot remove
	 * primary process' files because secondary should be able to run even
	 * if primary process is dead.
	 *
	 * In no_shconf mode, no runtime directory is created in the first
	 * place, so no cleanup needed.
	 */
	/* NOTE(review): unlike the other alerts, this message ends in "\n",
	 * and this failure path does not assign rte_errno here.
	 */
	if (!internal_config.no_shconf && eal_clean_runtime_dir() < 0) {
		rte_eal_init_alert("Cannot clear runtime directory\n");
		return -1;
	}

	/*
	 * In the primary process, write RTE_MAGIC into the magic member of
	 * the global memory configuration (struct rte_mem_config) to signal
	 * that the primary's EAL initialization is complete (annotator's
	 * note).
	 */
	rte_eal_mcfg_complete();

	/* Call each registered callback, if enabled */
	rte_option_init();

	/* On success, hand back the value eal_parse_args() returned. */
	return fctret;
}