千家信息网

分析数据库实现原理

发表于:2024-09-22 作者:千家信息网编辑
千家信息网最后更新 2024年09月22日,本篇内容介绍了"分析数据库实现原理"的有关知识,在实际案例的操作过程中,不少人都会遇到这样的困境,接下来就让小编带领大家学习一下如何处理这些情况吧!希望大家仔细阅读,能够学有所成!Hash连接,如内存
千家信息网最后更新 2024年09月22日分析数据库实现原理

本篇内容介绍了"分析数据库实现原理"的有关知识,在实际案例的操作过程中,不少人都会遇到这样的困境,接下来就让小编带领大家学习一下如何处理这些情况吧!希望大家仔细阅读,能够学有所成!

Hash连接:如果内存足够,首先遍历内表,按连接键的HashCode创建Hash表;然后遍历外表,对每一行的连接键计算HashCode,定位到Hash表中HashCode相同的桶,再遍历该桶中的链表,如连接键的值相等,则返回匹配的数据。
如果内存不够,则先分别遍历两张表,使用同一个Hash函数把两张表各自拆分为N个Hash"分区";然后依次取出内表的每一个Hash分区和外表中对应(HashCode相同)的Hash分区进行比较,如找到连接键值相等的数据,则返回匹配的数据。

详见代码注释.

#include #include #include "hash_join.h"#define MAX_ELEMENTS 1024//生成hash codestatic int generate_hashcode(int n){  return n % HASH_BUCKET;}//生成hash桶(写入到文件中,以文件的方式模拟)static int generate_bucket(FILE *file,char *tag){  printf("----------- generate_bucket ---------- \n");  //数组  char buf[MAX_BYTES];  FILE *fd = NULL;  for(;!feof(file);)  {    int x = read_int(file,buf);    if(x == 0)      break;    int hashcode = generate_hashcode(x);    char filename[30];    sprintf(filename,"/cygdrive/d/tmp/hash/%s_%d.csv",tag,hashcode);    //printf("Hash code is %d,Bucket filename is %s.\n",hashcode,filename);    fd = fopen(filename,"a");    if(fd == NULL)    {      printf("Can not open file %s.\n",filename);      return 0;    }      //写入文件中    write_int(fd,x);    fclose(fd);  }  return 1;}//把hash表加载到内存中,适用于内存足够的情况//使用二维数组模拟Hash表,D1 : hash桶,D2 : 桶中的数据static int load_hashtable(int ht[][MAX_ELEMENTS]){  printf("----------- load_hashtable ---------- \n");  for(int i=0;i < HASH_BUCKET;i++)  {    //循环桶号    char filename[MAX_BYTES];    //读文件    sprintf(filename,"/cygdrive/d/tmp/hash/inner_%d.csv",i);    FILE *fd = fopen(filename,"r");    if(fd == NULL){      //printf("Can not open file : %s\n",filename);      continue;    }    int j=0;    char buf[MAX_BYTES];    for(;!feof(fd) && j < MAX_ELEMENTS;)    {      //把文件内容放到数组中      int x = read_int(fd,buf);      ht[i][j++] = x;    }    fclose(fd);  }  return 1;}//使用内存创建hash表进行hash连接static void hash_join_onmemory(FILE *outerfile,FILE *innerfile){  printf("----------- hash_join_onmemory ---------- \n");  int ht[HASH_BUCKET][MAX_ELEMENTS];  char buffer[MAX_BYTES];  int flag = 0;  //创建hash bucket文件  flag = generate_bucket(innerfile,"inner");  if(!flag)  {    printf("Can not generate bucket file!\n");    return;  }  //加载到hash表中(二维数组模拟)  flag = load_hashtable(ht);  if(!flag)  {    printf("Can not load hash table!\n");    return;  }  //遍历第二个文件,执行JOIN  for(;!feof(outerfile);)  {    //读第二个文件,执行join    int outer = read_int(outerfile,buffer);    
//计算hashcode    int hashcode = generate_hashcode(outer);    for(int i=0;i < MAX_ELEMENTS;i++)    {      //遍历hash桶中的数据,找到对应的数据      if(ht[hashcode][i] == outer)      {        printf("Found one,hash bucket is %d,value is : %d.\n",hashcode,outer);      }    }  }}//使用磁盘缓存进行hash连接static void hash_join_ondisk(FILE *outerfile,FILE *innerfile){  printf("----------- hash_join_ondisk ---------- \n");  char buffer[MAX_BYTES];  int flag = 0;  //创建hash"桶"文件  flag = generate_bucket(innerfile,"inner");  if(!flag)  {    printf("Can not generate inner bucket file!\n");    return;  }  flag = generate_bucket(outerfile,"outer");  if(!flag)  {    printf("Can not generate outer bucket file!\n");    return;  }  //遍历hash值相同的文件,执行连接  for(int i=0;i < HASH_BUCKET;i++)  {    //从0号桶开始    char innerfname[MAX_BYTES];    char outerfname[MAX_BYTES];    //读文件    sprintf(innerfname,"/cygdrive/d/tmp/hash/%s_%d.csv","inner",i);    sprintf(outerfname,"/cygdrive/d/tmp/hash/%s_%d.csv","outer",i);    FILE *fd_inner = fopen(innerfname,"r");    if(fd_inner == NULL){      //printf("Can not open file : %s\n",filename);      continue;    }    FILE *fd_outer = fopen(outerfname,"r");    if(fd_outer == NULL)    {      continue;    }    for(;!feof(fd_outer);)    {      int v_out = read_int(fd_outer,buffer);      if(v_out == 0)        continue;      for(;!feof(fd_inner);)      {        int v_in = read_int(fd_inner,buffer);        if(v_in == 0)          continue;        if(v_out == v_in)        {          printf("Found one,hash bucket is %d,value is : %d.\n",i,v_out);                }      }      rewind(fd_inner);    }  }}//执行Hash连接void hash_join(char *file1,char * file2,char *flag){  printf("----------- hash join ---------- \n");  FILE *outerfile = fopen(file1,"r");  if(outerfile == NULL)  {    printf("Can not open file %s.\n",file1);    return;  }  //打开第二个文件  FILE *innerfile = fopen(file2,"r");  if(innerfile == NULL)  {    printf("Can not open file %s.\n",file2);    return;  }  //执行JOIN  if(strcmp(flag,"memory") 
== 0)    hash_join_onmemory(outerfile,innerfile);  else    hash_join_ondisk(outerfile,innerfile);  //关闭  fclose(outerfile);  fclose(innerfile);}

运行输出

$ cat file1.csv
1234512342939900220
$ cat file2.csv
1120340555023433901
$ /cygdrive/d/tmp/test.exe file1.csv file2.csv
------------- use memory ------------------
----------- hash join ----------
----------- hash_join_onmemory ----------
----------- generate_bucket ----------
----------- load_hashtable ----------
Found one,hash bucket is 1,value is : 1.
Found one,hash bucket is 3,value is : 3.
Found one,hash bucket is 1,value is : 1.
Found one,hash bucket is 106,value is : 234.
Found one,hash bucket is 20,value is : 20.
------------- use disk ------------------
----------- hash join ----------
----------- hash_join_ondisk ----------
----------- generate_bucket ----------
----------- generate_bucket ----------
Found one,hash bucket is 1,value is : 1.
Found one,hash bucket is 1,value is : 1.
Found one,hash bucket is 3,value is : 3.
Found one,hash bucket is 20,value is : 20.
Found one,hash bucket is 106,value is : 234.

"分析数据库实现原理"的内容就介绍到这里了,感谢大家的阅读。如果想了解更多行业相关的知识可以关注网站,小编将为大家输出更多高质量的实用文章!

0