千家信息网

hbase手动compact与split

发表于:2025-01-19 作者:千家信息网编辑
千家信息网最后更新 2025年01月19日
hbase手动compact与split(附 shell 脚本与 hbase-scan.py 辅助脚本)
#!/bin/bash
# Check HBase region sizes for a table and optionally split oversized regions.
#
# Usage: $0 check|split table_name [split_size]
#   check      - only report which regions exceed split_size
#   <other>    - split every oversized region first, wait, then report
#   split_size - threshold in bytes; defaults to 1073741824 (1 GiB)

die () {
    echo >&2 "$@"
    echo "usage:"
    echo "       $0 check|split table_name [split_size]"
    exit 1
}

[[ "$#" -lt 2 ]] && die "at least 2 arguments required, $# provided"

COMMAND=$1
TABLE=$2
SIZE="${3:-1073741824}"   # default split threshold: 1 GiB

# Resolve the full region key from hbase:meta by region-name substring,
# then issue a split command through the HBase shell.
split() {
    region_key=$(python /home/hduser/hbase/hbase-scan.py -t hbase:meta -f "RowFilter (=, 'substring:$1')")
    echo "split '$region_key'" | hbase shell
}

if [ "$COMMAND" != "check" ] ; then
    for region in $(hadoop fs -ls "/hbase/data/default/$TABLE" | awk '{print $8}')
    do
        # skip hidden entries such as .tabledesc / .tmp
        [[ ${region##*/} =~ ^\. ]] && continue
        [[ $(hadoop fs -du -s "$region" | awk '{print $1}') -gt $SIZE ]] && split "${region##*/}"
    done
    # give the region servers time to finish splitting before re-checking
    sleep 60
fi

# Report pass: classify every region as huge or small relative to SIZE.
for region in $(hadoop fs -ls "/hbase/data/default/$TABLE" | awk '{print $8}')
do
    [[ ${region##*/} =~ ^\. ]] && continue
    # human-readable size computed once per region (original ran -du twice)
    human=$(hadoop fs -du -s -h "$region" | awk '{print $1 $2}')
    if [[ $(hadoop fs -du -s "$region" | awk '{print $1}') -gt $SIZE ]] ; then
        echo "${region##*/} ($human) is a huge region"
    else
        echo "${region##*/} ($human) is a small region"
    fi
done


hbase-scan.py

"""Scan an HBase table via happybase and print the matching row keys.

Invoked by the companion split script as:
    python hbase-scan.py -t hbase:meta -f "RowFilter (=, 'substring:<region>')"
"""
import subprocess
import datetime
import argparse
import csv
import gzip
import happybase
import logging


def connect_to_hbase():
    """Open a Thrift connection to the HBase gateway host."""
    return happybase.Connection('itr-hbasetest01')


def main():
    logging.basicConfig(
        format='%(asctime)s %(name)s %(levelname)s: %(message)s',
        level=logging.INFO)
    argp = argparse.ArgumentParser(description='EventLog Reader')
    argp.add_argument('-t', '--table', dest='table', default='eventlog')
    argp.add_argument('-p', '--prefix', dest='prefix')
    argp.add_argument('-f', '--filter', dest='filter')
    # type=int makes a non-numeric limit fail at parse time with a clear
    # argparse error instead of a ValueError deep inside main()
    argp.add_argument('-l', '--limit', dest='limit', type=int, default=10)
    args = argp.parse_args()

    hbase_conn = connect_to_hbase()
    try:
        table = hbase_conn.table(args.table)
        logging.info("scan start")
        # NOTE: table.scan() is lazy; rows are actually fetched while iterating.
        scanner = table.scan(row_prefix=args.prefix, batch_size=1000,
                             limit=args.limit, filter=args.filter)
        logging.info("scan done")
        i = 0
        for key, data in scanner:
            logging.info(key)
            # print(key) is valid Python 2 and 3; the stdout line is what the
            # calling shell script captures, so it must stay on stdout.
            print(key)
            i += 1
        logging.info('%s rows read in total', i)
    finally:
        # original leaked the Thrift connection; always close it
        hbase_conn.close()


if __name__ == '__main__':
    main()


0