How to Implement WordCount on Hadoop

Published: 2025-02-06, by the 千家信息网 editorial team

This article explains how to implement WordCount on Hadoop. Many people have questions about this in day-to-day work, so this post pulls together a simple, workable procedure based on a range of references. Hopefully it helps answer those questions. Follow along and try it yourself!

The official example:

WordCount2.java

import java.io.BufferedReader;
import java.io.FileReader;
import java.io.IOException;
import java.net.URI;
import java.util.ArrayList;
import java.util.HashSet;
import java.util.List;
import java.util.Set;
import java.util.StringTokenizer;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.hadoop.mapreduce.Counter;
import org.apache.hadoop.util.GenericOptionsParser;
import org.apache.hadoop.util.StringUtils;

public class WordCount2 {

    public static class TokenizerMapper extends
            Mapper<Object, Text, Text, IntWritable> {

        static enum CountersEnum {
            INPUT_WORDS
        }

        private final static IntWritable one = new IntWritable(1);
        private Text word = new Text();

        private boolean caseSensitive;
        private Set<String> patternsToSkip = new HashSet<String>();

        private Configuration conf;
        private BufferedReader fis;

        @Override
        public void setup(Context context) throws IOException,
                InterruptedException {
            conf = context.getConfiguration();
            caseSensitive = conf.getBoolean("wordcount.case.sensitive", true);
            // The official example defaults this to true, which throws an error when no
            // patterns file is configured; defaulting to false works.
            // See: https://issues.apache.org/jira/browse/MAPREDUCE-6038
            if (conf.getBoolean("wordcount.skip.patterns", false)) {
                URI[] patternsURIs = Job.getInstance(conf).getCacheFiles();
                for (URI patternsURI : patternsURIs) {
                    Path patternsPath = new Path(patternsURI.getPath());
                    String patternsFileName = patternsPath.getName().toString();
                    parseSkipFile(patternsFileName);
                }
            }
        }

        private void parseSkipFile(String fileName) {
            try {
                fis = new BufferedReader(new FileReader(fileName));
                String pattern = null;
                while ((pattern = fis.readLine()) != null) {
                    patternsToSkip.add(pattern);
                }
            } catch (IOException ioe) {
                System.err.println("Caught exception while parsing the cached file '"
                        + StringUtils.stringifyException(ioe));
            }
        }

        @Override
        public void map(Object key, Text value, Context context)
                throws IOException, InterruptedException {
            String line = (caseSensitive) ? value.toString() : value.toString()
                    .toLowerCase();
            for (String pattern : patternsToSkip) {
                line = line.replaceAll(pattern, "");
            }
            StringTokenizer itr = new StringTokenizer(line);
            while (itr.hasMoreTokens()) {
                word.set(itr.nextToken());
                context.write(word, one);
                Counter counter = context.getCounter(
                        CountersEnum.class.getName(),
                        CountersEnum.INPUT_WORDS.toString());
                counter.increment(1);
            }
        }
    }

    public static class IntSumReducer extends
            Reducer<Text, IntWritable, Text, IntWritable> {
        private IntWritable result = new IntWritable();

        public void reduce(Text key, Iterable<IntWritable> values,
                Context context) throws IOException, InterruptedException {
            int sum = 0;
            for (IntWritable val : values) {
                sum += val.get();
            }
            result.set(sum);
            context.write(key, result);
        }
    }

    public static void main(String[] args) throws Exception {
        Configuration conf = new Configuration();
        GenericOptionsParser optionParser = new GenericOptionsParser(conf, args);
        String[] remainingArgs = optionParser.getRemainingArgs();
        if (remainingArgs.length != 2 && remainingArgs.length != 4) {
            System.err.println("Usage: wordcount <in> <out> [-skip skipPatternFile]");
            System.exit(2);
        }
        Job job = Job.getInstance(conf, "word count");
        job.setJarByClass(WordCount2.class);
        job.setMapperClass(TokenizerMapper.class);
        job.setCombinerClass(IntSumReducer.class);
        job.setReducerClass(IntSumReducer.class);
        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(IntWritable.class);

        List<String> otherArgs = new ArrayList<String>();
        for (int i = 0; i < remainingArgs.length; ++i) {
            if ("-skip".equals(remainingArgs[i])) {
                job.addCacheFile(new Path(remainingArgs[++i]).toUri());
                job.getConfiguration().setBoolean("wordcount.skip.patterns", true);
            } else {
                otherArgs.add(remainingArgs[i]);
            }
        }
        FileInputFormat.addInputPath(job, new Path(otherArgs.get(0)));
        FileOutputFormat.setOutputPath(job, new Path(otherArgs.get(1)));

        System.exit(job.waitForCompletion(true) ? 0 : 1);
    }
}
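The -skip option reads a pattern file from the distributed cache: one regular expression per line, and map() strips every match from each input line before tokenizing (see parseSkipFile and the replaceAll call). As a minimal sketch, a file such as patterns.txt, modeled on the official tutorial and assumed to already be uploaded to HDFS, could contain:

\.
\,
\!
to

Assuming the wc.jar built below, a run that disables case sensitivity and applies the pattern file could then look like this (the /program paths are illustrative):

bin/hadoop jar wc.jar WordCount2 -Dwordcount.case.sensitive=false /program/input /program/output -skip /program/patterns.txt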
cd /data/program
javac -classpath /home/hadoop/hadoop-2.7.1/share/hadoop/common/hadoop-common-2.7.1.jar:/home/hadoop/hadoop-2.7.1/share/hadoop/mapreduce/hadoop-mapreduce-client-core-2.7.1.jar:/home/hadoop/hadoop-2.7.1/share/hadoop/common/lib/commons-cli-1.2.jar WordCount2.java
jar cf wc.jar WordCount*.class
cd /home/hadoop/hadoop-2.7.1/
bin/hadoop jar wc.jar WordCount2 /program/input /program/output
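Note that the input directory must exist in HDFS before the job runs, and the output directory must not exist, or the job fails; results are written to part-r-00000 files under the output path. A rough sketch of preparing input and inspecting the result, assuming two hypothetical local text files file01 and file02 and the /program paths from above:

bin/hadoop fs -mkdir -p /program/input
bin/hadoop fs -put file01 file02 /program/input
bin/hadoop fs -cat /program/output/part-r-00000

Each line of the output is a word, a tab, and its count.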

That concludes this walkthrough of how to implement WordCount on Hadoop. Hopefully it has cleared up your questions. Pairing theory with practice is the best way to learn, so give it a try! For more articles like this, keep following the site.
