Maven工程的MapReduce程序4

您所在的位置:网站首页 0号分区与1号分区重叠 Maven工程的MapReduce程序4

Maven工程的MapReduce程序4

2024-05-26 13:26| 来源: 网络整理| 查看: 265

MapReduce序列化、分区实验

有一张员工表emp.csv,内容如下:

SAL:为员工工资

7369,SMITH,CLERK,7902,1980/12/17,800,,20
7499,ALLEN,SALESMAN,7698,1981/2/20,1600,300,30
7521,WARD,SALESMAN,7698,1981/2/22,1250,500,30
7566,JONES,MANAGER,7839,1981/4/2,2975,,20
7654,MARTIN,SALESMAN,7698,1981/9/28,1250,1400,30
7698,BLAKE,MANAGER,7839,1981/5/1,2850,,30
7782,CLARK,MANAGER,7839,1981/6/9,2450,,10
7788,SCOTT,ANALYST,7566,1987/4/19,3000,,20
7839,KING,PRESIDENT,,1981/11/17,5000,,10
7844,TURNER,SALESMAN,7698,1981/9/8,1500,0,30
7876,ADAMS,CLERK,7788,1987/5/23,1100,,20
7900,JAMES,CLERK,7698,1981/12/3,950,,30
7902,FORD,ANALYST,7566,1981/12/3,3000,,20
7934,MILLER,CLERK,7782,1982/1/23,1300,,10

根据如上emp.csv表,假设:

薪资<1500,为低薪;薪资>=1500,而且薪资<3000,为中薪;薪资>=3000,为高薪。

问题:编写程序实现将对员工数据按低薪、中薪、高薪进行分区存储,输出到三个文件。

要求:职工信息采用一个独立的类存放,并且实现Hadoop序列化。

 

本实验是在案例四的基础上进行分析:

由以上分析,一共有5个类

新建Maven工程,配置好pom.xml(参考案例二),建立相应的5个类。

  参考代码:

Employee.java

package com.myPatition2; import java.io.DataInput; import java.io.DataOutput; import java.io.IOException; import org.apache.hadoop.io.Writable; //定义Employee类实现序列化接口 public class Employee implements Writable{ //字段名 EMPNO, ENAME, JOB, MGR, HIREDATE, SAL, COMM, DEPTNO //数据类型:Int,Char, Char , Int, Date , Int Int, Int //数据: 7654, MARTIN, SALESMAN, 7698, 1981/9/28, 1250, 1400, 30 //由以上定义变量 private int empno; private String ename; private String job; private int mgr; private String hiredate; private int sal; private int comm;//奖金 private int deptno; @Override public String toString() { // return "Employee [empno=" + empno + ", ename=" + ename + ", sal=" + sal + ", deptno=" + deptno + "]"; return empno+","+ename+","+job+","+mgr+","+hiredate+","+sal+","+comm+","+deptno; } //序列化方法:将java对象转化为可跨机器传输数据流(二进制串/字节)的一种技术 public void write(DataOutput out) throws IOException { out.writeInt(this.empno); out.writeUTF(this.ename); out.writeUTF(this.job); out.writeInt(this.mgr); out.writeUTF(this.hiredate); out.writeInt(this.sal); out.writeInt(this.comm); out.writeInt(this.deptno); } //反序列化方法:将可跨机器传输数据流(二进制串)转化为java对象的一种技术 public void readFields(DataInput in) throws IOException { this.empno = in.readInt(); this.ename = in.readUTF(); this.job = in.readUTF(); this.mgr = in.readInt(); this.hiredate = in.readUTF(); this.sal = in.readInt(); this.comm = in.readInt(); this.deptno = in.readInt(); } //其他类通过set/get方法操作变量:Source-->Generator Getters and Setters public int getEmpno() { return empno; } public void setEmpno(int empno) { this.empno = empno; } public String getEname() { return ename; } public void setEname(String ename) { this.ename = ename; } public String getJob() { return job; } public void setJob(String job) { this.job = job; } public int getMgr() { return mgr; } public void setMgr(int mgr) { this.mgr = mgr; } public String getHiredate() { return hiredate; } public void setHiredate(String hiredate) { this.hiredate = hiredate; } public int getSal() { return sal; } public void 
setSal(int sal) { this.sal = sal; } public int getComm() { return comm; } public void setComm(int comm) { this.comm = comm; } public int getDeptno() { return deptno; } public void setDeptno(int deptno) { this.deptno = deptno; } }

注意:Employee类 ,要重写toString()方法,构造出Reduce所要的输出。

 

SalaryTotalMapper

package com.myPatition2; import java.io.IOException; import org.apache.hadoop.io.LongWritable; import org.apache.hadoop.io.NullWritable; import org.apache.hadoop.io.Text; import org.apache.hadoop.mapreduce.Mapper; public class SalaryTotalMapper extends Mapper< LongWritable, Text, NullWritable, Employee> { @Override protected void map(LongWritable k1, Text v1, Context context) throws IOException, InterruptedException { //数据:7499,ALLEN,SALESMAN,7698,1981/2/20,1600,300,30 String data = v1.toString(); String[] words = data.split(","); //创建员工对象 Employee emp = new Employee(); //设置员工属性 emp.setEmpno(Integer.parseInt(words[0])); emp.setEname(words[1]); emp.setJob(words[2]); try { emp.setMgr(Integer.parseInt(words[3]));//可能为空,加try...catch包围 } catch (NumberFormatException ex) { ex.printStackTrace(); } emp.setHiredate(words[4]); emp.setSal(Integer.parseInt(words[5])); try { emp.setComm(Integer.parseInt(words[6]));//可能为空 } catch (NumberFormatException ex) { ex.printStackTrace(); } emp.setDeptno(Integer.parseInt(words[7])); //取出部门号words[7],将String转换为Int,Int转换为IntWritable对象,赋值为k2 NullWritable k2 = NullWritable.get(); //取出工资words[5],将String转换为Int,Int转换为IntWritable对象,赋值为v2 Employee v2 = emp; //输出k2, v2 context.write(k2, v2); } }

 SalaryTotalReducer.java

package com.myPatition2;

import java.io.IOException;

import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Reducer;

/**
 * Writes every {@link Employee} it receives as one line of text.
 *
 * <p>The type arguments are required: without them the {@code for} loop over
 * {@code Employee} values and the {@code @Override} on {@code reduce} do not
 * compile.
 */
public class SalaryTotalReducer extends Reducer<NullWritable, Employee, NullWritable, Text> {

    /** Reused for every output record to avoid allocating one {@link Text} per employee. */
    private final Text outValue = new Text();

    /**
     * Emits the text form of each employee in the group.
     *
     * @param k3      the group key, passed through unchanged as the output key
     * @param v3      the employees in this group
     * @param context the task context used to emit the output pairs
     * @throws IOException          if writing the output fails
     * @throws InterruptedException if the task is interrupted while writing
     */
    @Override
    protected void reduce(NullWritable k3, Iterable<Employee> v3, Context context)
            throws IOException, InterruptedException {
        for (Employee v : v3) {
            outValue.set(v.toString());
            context.write(k3, outValue);
        }
    }
}

SalaryTotalMain.java 

 

package com.myPatition2; import org.apache.hadoop.fs.Path; import org.apache.hadoop.io.NullWritable; import org.apache.hadoop.io.Text; import org.apache.hadoop.mapreduce.Job; import org.apache.hadoop.mapreduce.lib.input.FileInputFormat; import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat; public class SalaryTotalMain { public static void main(String[] args) throws Exception { //1. 创建一个job和任务入口(指定主类) Job job = Job.getInstance(); job.setJarByClass(SalaryTotalMain.class); //2. 指定job的mapper和输出的类型 job.setMapperClass(SalaryTotalMapper.class); job.setMapOutputKeyClass(NullWritable.class); job.setMapOutputValueClass(Employee.class); //这里有变化: //指定任务的分区规则的类 job.setPartitionerClass(SalaryTotalPartitioner.class); //指定建立几个分区 job.setNumReduceTasks(3); //3. 指定job的reducer和输出的类型 job.setReducerClass(SalaryTotalReducer.class); job.setOutputKeyClass(NullWritable.class); job.setOutputValueClass(Text.class); //4. 指定job的输入和输出路径 FileInputFormat.setInputPaths(job, new Path(args[0])); FileOutputFormat.setOutputPath(job, new Path(args[1])); //5. 执行job job.waitForCompletion(true); } }

SalaryTotalPartitioner.java 

package com.myPatition2; import org.apache.hadoop.io.NullWritable; import org.apache.hadoop.mapreduce.Partitioner; // map-outputs:k2,v2-->NullWritable, Employee public class SalaryTotalPartitioner extends Partitioner{ @Override public int getPartition(NullWritable k2, Employee v2, int numPatition) { //如何分区: 每个部门放在一个分区 if(v2.getSal() < 1500) { //放入1号分区中 return 1%numPatition;// 1%3=1 }else if(v2.getSal() >=1500 && v2.getSal() < 3000){ //放入2号分区中 return 2%numPatition;// 2%3=2 }else { //放入3号分区中 return 3%numPatition;// 3%3=0 } } }

写好代码后,打成jar包,提交到hadoop去执行

查看输出文件如下 :除了_SUCCESS外,有三个输出文件。

查看输出结果,员工数据确实是按照薪水等级分别存放的,说明程序运行正确。

 

完成! enjoy it! 



【本文地址】


今日新闻


推荐新闻


CopyRight 2018-2019 办公设备维修网 版权所有 豫ICP备15022753号-3