Java API MapReduce Operations: WordCount


First, the pom file:
<?xml version="1.0" encoding="UTF-8"?>
<project xmlns="http://maven.apache.org/POM/4.0.0"
         xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
         xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
    <modelVersion>4.0.0</modelVersion>
    <groupId>org.example</groupId>
    <artifactId>HDFS_Demo</artifactId>
    <version>1.0-SNAPSHOT</version>
    <properties>
        <maven.compiler.source>11</maven.compiler.source>
        <maven.compiler.target>11</maven.compiler.target>
        <hadoop.version>3.3.1</hadoop.version>
    </properties>
    <dependencies>
        <!-- Required Hadoop dependencies -->
        <dependency>
            <groupId>org.apache.hadoop</groupId>
            <artifactId>hadoop-common</artifactId>
            <version>${hadoop.version}</version>
        </dependency>
        <dependency>
            <groupId>org.apache.hadoop</groupId>
            <artifactId>hadoop-hdfs</artifactId>
            <version>${hadoop.version}</version>
        </dependency>
        <dependency>
            <groupId>org.apache.hadoop</groupId>
            <artifactId>hadoop-client</artifactId>
            <version>${hadoop.version}</version>
        </dependency>
        <!-- MapReduce -->
        <dependency>
            <groupId>org.apache.hadoop</groupId>
            <artifactId>hadoop-mapreduce-client-core</artifactId>
            <version>${hadoop.version}</version>
        </dependency>
        <dependency>
            <groupId>org.apache.hadoop</groupId>
            <artifactId>hadoop-mapreduce-client-jobclient</artifactId>
            <version>${hadoop.version}</version>
        </dependency>
    </dependencies>
</project>
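With the dependencies in place, the project can be packaged into a jar with Maven (a minimal invocation; adjust the goals to your own build setup):
mvn clean package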
The code (for running against a local file from IDEA):
package com.mapreduce;
import java.io.IOException;
import java.util.StringTokenizer;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
public class WordCount {
    // Static nested class (a static nested class can only access static members of the outer class)
    public static class WordCountMapper extends Mapper<LongWritable, Text, Text,IntWritable>{
        // #2: output key/value objects reused across records (avoids allocating a new object per word)
        private Text mapOutPutKey = new Text();
        private final static IntWritable mapOutPutValue = new IntWritable(1);
        @Override
        protected void map(LongWritable key, Text value, Mapper<LongWritable, Text, Text, IntWritable>.Context context)
                throws IOException, InterruptedException {
            // #1 Approach 1
//            String [] words = value.toString().split(" ");// splitting directly performs worse
//            for (String word : words) {
//                context.write(new Text(word), new IntWritable(1));
//            }
            // #2 Approach 2: more efficient
            StringTokenizer stringTokenizer = new StringTokenizer(value.toString());
            while(stringTokenizer.hasMoreTokens()){
                String wordValue = stringTokenizer.nextToken();
                mapOutPutKey.set(wordValue);
                context.write(mapOutPutKey,mapOutPutValue);
            }
        }
    }
    public static class WordCountReducer extends Reducer<Text, IntWritable, Text, IntWritable> {
        // Note: reduce's input value is an Iterable<IntWritable>, because the framework groups the map output by key during the shuffle, merging all values of the same key into one collection, e.g. <hadoop, list(1,1,...)>
        @Override
        protected void reduce(Text key, Iterable<IntWritable> values,Context context) throws IOException, InterruptedException {
            int sum = 0;
            for (IntWritable intWritable : values) {
                // total
                sum += intWritable.get();
            }
            context.write(key, new IntWritable(sum));// alternatively, reuse a single IntWritable field and call set(sum) on it
        }
    }
    // driver (pulled out into its own run() method)
    public int run(String[] args) throws Exception {
        //1.get configuration
        Configuration conf = new Configuration();
//        conf.set("mapreduce.framework.name", "local");
        //2.create job
        Job job = Job.getInstance(conf,this.getClass().getSimpleName());
        // run jar
        job.setJarByClass(WordCount.class);
        //3.set job (input -> map -> reduce -> output)
        //3.1 map
        job.setMapperClass(WordCountMapper.class);
        // set the map-side output key and value classes
        job.setMapOutputKeyClass(Text.class);
        job.setMapOutputValueClass(IntWritable.class);
        //3.2 reduce
        job.setReducerClass(WordCountReducer.class);
        // set the reduce-side (final job) output key and value classes
        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(IntWritable.class);
        //3.3
        //input path
        FileInputFormat.addInputPath(job,new Path("/Users/mac/Desktop/客户/源码/HDFS_Demo/src/main/resources/input/test_dfs.txt"));
        //output path
        FileOutputFormat.setOutputPath(job,new Path("/Users/mac/Desktop/客户/源码/HDFS_Demo/src/main/resources/output"));
        // submit the job
        boolean isSuccess = job.waitForCompletion(true);// true prints job progress while it runs
        System.out.println(isSuccess);
        return isSuccess ? 0 : 1;
    }
    // run the whole program
    public static void main(String[] args) throws Exception {
        int status = new WordCount().run(args);
        // exit with the job's status code
        System.exit(status);
    }
}
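As a quick sanity check of what the job computes: given a hypothetical input file containing

hadoop hdfs hadoop
mapreduce

the output file would contain one tab-separated count per word:

hadoop	2
hdfs	1
mapreduce	1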
Note the paths used above.

To run against files on Hadoop (HDFS), change the input/output paths, package the project into a jar, and upload it to the cluster, as sketched below.
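A minimal sketch of that change, assuming the input and output paths are passed as the first two command-line arguments (the argument positions are an assumption, not part of the original code):

        //3.3 take the paths from the command line instead of hardcoding them
        // sketch: assumes args[0] = input path, args[1] = output path
        FileInputFormat.addInputPath(job, new Path(args[0]));
        FileOutputFormat.setOutputPath(job, new Path(args[1]));

With this change, the submit command below would also need the two paths appended, e.g. ending in /input /output.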
Creating directories on HDFS
Create the input directory:
hdfs dfs -mkdir -p /input
Delete the output directory (the job fails if its output path already exists):
hdfs dfs -rm -r /output
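Upload a local test file into the input directory (test_dfs.txt is the sample file name from the code above; substitute your own):
hdfs dfs -put test_dfs.txt /input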
Submit the jar:
hadoop jar HDFS_Demo-1.0-SNAPSHOT.jar com.mapreduce.WordCount
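After the job finishes, the word counts can be inspected directly (part-r-00000 is the conventional name of the first reducer's output file):
hdfs dfs -cat /output/part-r-00000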
