千家信息网

hive UDF如何通过IP地址获取 IDC/省份/城市

发表于:2024-09-22 作者:千家信息网编辑
千家信息网最后更新 2024年09月22日,这篇文章将为大家详细讲解有关hive UDF如何通过IP地址获取 IDC/省份/城市,小编觉得挺实用的,因此分享给大家做个参考,希望大家阅读完这篇文章后可以有所收获。简述:简单的2分查找算法,根据IP
千家信息网最后更新 2024年09月22日hive UDF如何通过IP地址获取 IDC/省份/城市

这篇文章将为大家详细讲解有关hive UDF如何通过IP地址获取 IDC/省份/城市,小编觉得挺实用的,因此分享给大家做个参考,希望大家阅读完这篇文章后可以有所收获。

简述:

简单的二分查找算法,根据IP地址定位IP所属的IP段,然后获取IP段的IDC/省份/城市的信息。(二分查找要求地址库中的IP段按开始IP升序排列且互不重叠。)

输入:IP地理信息文件,一般地址库拿到后需要格式化一下,参考:
1. 如果省份是 null 或者 '',城市是 null 或者 '' => 省份=其他 and 城市=其他
2. 省份非空且为直辖市,但是城市非直辖市 => 城市=直辖市
3. 省份非空且非直辖市,但是城市为空 => 城市=其他
4. 省份或城市中有(、\等非法信息 => 省份=其他 and 城市=其他

/user/hadoop/IP.csv

格式:

 编号,开始IP(long),结束IP(long),省份,城市,IDC,开始IP,结束IP
 29990,16777472,16778239,福建省,其他,电信,1.0.1.0,1.0.3.255
 29991,16779264,16781311,广东省,其他,电信,1.0.8.0,1.0.15.255
 29992,16785408,16793599,广东省,其他,电信,1.0.32.0,1.0.63.255

用法 & 输出:

编辑打包或者编译到hive中参考这篇,这里不再多说:http://my.oschina.net/wangjiankui/blog/64230

get_ip_location_new(visitip,'IDC') //返回IDC信息

get_ip_location_new(visitip,'REGION') //返回省份信息

get_ip_location_new(visitip,'CITY') //返回城市信息

代码:

package com.xxx.hive.udf;

import java.io.BufferedReader;
import java.io.IOException;
import java.io.InputStreamReader;
import java.net.URI;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import java.util.StringTokenizer;
import java.util.regex.Pattern;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FSDataInputStream;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.hive.ql.exec.UDF;
import org.apache.hadoop.io.IOUtils;
import org.apache.hadoop.io.Text;

/**
 * Hive UDF that maps an IPv4 address to the IDC, region (province) or city of
 * the IP range that contains it.
 *
 * <p>The range table is read once from {@code /user/hadoop/IP.csv} on the
 * default file system. Each usable line has at least six comma-separated
 * fields: {@code id,startIp(long),endIp(long),region,city,idc[,...]}. Lookup is
 * a binary search, so the ranges must be sorted by start address and must not
 * overlap.
 *
 * <p>NOTE(review): all lookup state is held in unsynchronized static fields;
 * this class performs no locking of its own. Confirm that the hosting
 * environment never calls {@link #evaluate(Text, Text)} concurrently within
 * one JVM.
 */
public class UDFGetIPLocationNew extends UDF {

  /** Dotted-quad syntax check; compiled once instead of on every row. */
  private static final Pattern IPV4_PATTERN =
      Pattern.compile("\\d{1,3}\\.\\d{1,3}\\.\\d{1,3}\\.\\d{1,3}");

  /** Column positions inside one CSV line. */
  private static final int START_FIELD = 1;
  private static final int END_FIELD = 2;
  private static final int REGION_FIELD = 3;
  private static final int CITY_FIELD = 4;
  private static final int IDC_FIELD = 5;
  private static final int MIN_FIELDS = 6;

  /** Raw CSV lines of the usable ranges; index-aligned with the two arrays below. */
  public static List<String> map = new ArrayList<String>();
  /** Start address of range {@code i} (inclusive). */
  public static long[] start_from_index;
  /** End address of range {@code i} (inclusive). */
  public static long[] end_to_index;

  /** Per-kind result caches keyed by the numeric IP. Only hits are cached. */
  public static Map<Long, String> idcCache = new HashMap<Long, String>();
  public static Map<Long, String> regionCache = new HashMap<Long, String>();
  public static Map<Long, String> cityCache = new HashMap<Long, String>();

  /**
   * Reads the range table and publishes it to {@link #map},
   * {@link #start_from_index} and {@link #end_to_index}.
   *
   * <p>Lines that do not have at least six fields, or whose start/end fields
   * are not numbers (for example a header line), are skipped so that one bad
   * line cannot fail every row of the query. If reading fails part-way, nothing
   * is published: the stack trace is printed, the table stays empty and the
   * next call to {@link #evaluate(Text, Text)} tries again.
   */
  private void loadIPLocation() {
    Configuration conf = new Configuration();
    String namenode = conf.get("fs.default.name");
    String uri = namenode + "/user/hadoop/IP.csv";

    List<String> lines = new ArrayList<String>();
    List<long[]> ranges = new ArrayList<long[]>();
    boolean complete = false;

    FSDataInputStream in = null;
    BufferedReader reader = null;
    try {
      // The FileSystem handle is deliberately not closed here: FileSystem.get
      // normally hands out a shared, cached instance (see the Hadoop docs).
      FileSystem fs = FileSystem.get(URI.create(uri), conf);
      in = fs.open(new Path(uri));
      // The file contains Chinese text, so do not depend on the platform
      // default charset. NOTE(review): UTF-8 is assumed; confirm the encoding
      // of IP.csv.
      reader = new BufferedReader(new InputStreamReader(in, "UTF-8"));
      String line;
      while ((line = reader.readLine()) != null) {
        long[] range = parseRange(line);
        if (range != null) {
          lines.add(line);
          ranges.add(range);
        }
      }
      complete = true;
    } catch (IOException e) {
      e.printStackTrace();
    } finally {
      IOUtils.closeStream(reader);
      IOUtils.closeStream(in);
    }

    if (!complete) {
      // Never serve lookups from a truncated table.
      lines.clear();
      ranges.clear();
    }

    long[] starts = new long[ranges.size()];
    long[] ends = new long[ranges.size()];
    for (int i = 0; i < starts.length; i++) {
      long[] range = ranges.get(i);
      starts[i] = range[0];
      ends[i] = range[1];
    }
    // Publish the arrays before the lines: a non-empty map is the "loaded" flag.
    start_from_index = starts;
    end_to_index = ends;
    map.addAll(lines);
  }

  /**
   * Extracts the numeric start and end address from one CSV line.
   *
   * @param line one line of the range file
   * @return {@code {start, end}}, or {@code null} if the line is unusable
   */
  private static long[] parseRange(String line) {
    // limit -1 keeps trailing empty fields, so an empty IDC column still counts.
    String[] fields = line.split(",", -1);
    if (fields.length < MIN_FIELDS) {
      return null;
    }
    try {
      return new long[] {
        Long.parseLong(fields[START_FIELD].trim()),
        Long.parseLong(fields[END_FIELD].trim())
      };
    } catch (NumberFormatException ignored) {
      // Header or otherwise malformed line: skipped on purpose.
      return null;
    }
  }

  /**
   * Finds the range that contains {@code ip}.
   *
   * @param start inclusive range starts, sorted ascending
   * @param end inclusive range ends, index-aligned with {@code start}
   * @param ip address to look up
   * @return index of the containing range, or {@code -1} if there is none
   */
  public static int binarySearch(long[] start, long[] end, long ip) {
    int low = 0;
    int high = start.length - 1;
    while (low <= high) {
      int middle = (low + high) >>> 1; // overflow-safe midpoint
      if (ip >= start[middle] && ip <= end[middle]) {
        return middle;
      }
      if (ip < start[middle]) {
        high = middle - 1;
      } else {
        low = middle + 1;
      }
    }
    return -1;
  }

  /**
   * Converts a dotted-quad IPv4 string to its 32-bit value held in a long.
   *
   * @param ip text such as {@code "1.0.1.0"}
   * @return the numeric address, or {@code 0} if {@code ip} is {@code null},
   *     is not four dot-separated groups of 1-3 digits, or has a group above 255
   */
  public static long ip2long(String ip) {
    if (ip == null || !IPV4_PATTERN.matcher(ip).matches()) {
      return 0L;
    }
    long ipNum = 0L;
    for (String octet : ip.split("[.]")) {
      long value = Long.parseLong(octet);
      if (value > 255L) {
        // An out-of-range group would spill into the neighbouring byte and
        // silently produce a different, valid-looking address.
        return 0L;
      }
      ipNum = (ipNum << 8) | value;
    }
    return ipNum;
  }

  /**
   * Looks up one attribute of the IP range that contains {@code ip}.
   *
   * @param ip dotted-quad IPv4 address; {@code null} yields {@code null}
   * @param which {@code "IDC"}, {@code "REGION"} or {@code "CITY"}
   * @return the requested attribute; {@code "Other IDC"} when no range contains
   *     the address (including unparseable addresses); a usage message when
   *     {@code which} is {@code null} or not one of the three keywords
   */
  public String evaluate(Text ip, Text which) {
    String whichString = (which == null) ? null : which.toString();

    final Map<Long, String> cache;
    final int field;
    if ("IDC".equals(whichString)) {
      cache = idcCache;
      field = IDC_FIELD;
    } else if ("REGION".equals(whichString)) {
      cache = regionCache;
      field = REGION_FIELD;
    } else if ("CITY".equals(whichString)) {
      cache = cityCache;
      field = CITY_FIELD;
    } else {
      return "Unknown Args!use(IDC or REGION or CITY)";
    }

    if (ip == null) {
      return null;
    }

    if (map.isEmpty()) {
      loadIPLocation();
    }

    Long key = Long.valueOf(ip2long(ip.toString()));

    // Consult the cache directly. The previous code used index 0 as its
    // "cached" marker, which collided with a genuine match on the first range
    // and returned null for every address in it.
    String cached = cache.get(key);
    if (cached != null) {
      return cached;
    }

    int index = binarySearch(start_from_index, end_to_index, key.longValue());
    if (index == -1) {
      return "Other IDC";
    }

    String value = map.get(index).split(",", -1)[field];
    cache.put(key, value);
    return value;
  }

  /** Manual smoke test; needs a reachable file system holding IP.csv. */
  public static void main(String[] args) {
    long startTime = System.currentTimeMillis();
    System.out.println("now:" + startTime);
    UDFGetIPLocationNew getIPLocation = new UDFGetIPLocationNew();
    Text ip = new Text("112.122.64.0");
    System.out.printf(
        "ip = %s, %s, %s, %s\n",
        new Object[] {
          ip,
          getIPLocation.evaluate(ip, new Text("IDC")),
          getIPLocation.evaluate(ip, new Text("REGION")),
          getIPLocation.evaluate(ip, new Text("CITY"))
        });
    long endTime = System.currentTimeMillis();
    System.out.println("over:" + endTime);
    System.out.println("count:" + (endTime - startTime) * 1.0D / 1000.0D);
  }
}

关于"hive UDF如何通过IP地址获取 IDC/省份/城市"这篇文章就分享到这里了,希望以上内容可以对大家有一定的帮助,使各位可以学到更多知识,如果觉得文章不错,请把它分享出去让更多的人看到。

0