大数据实验任务一

大数据实验任务之筛选文件并合并

AmwQqwWmg

886人浏览 · 2022-10-05 22:50:48

AmwQqwWmg · 2022-10-05 22:50:48 发布

题目：假设在HDFS下有几个文件，分别是file1.txt、file2.txt、file3.txt、file4.abc、file5.abc，这里需要从目录中过滤出所有后缀不为.abc的文件，对过滤之后的文件进行读取，并将这些文件的内容合并到文件merge.txt中。

文章目录

一、新建Java的maven工程
二、配置环境
- 1、配置坐标（pom.xml）
- 2、配置日志（log4j.properties）
三、编写程序代码(merge.java)
四、运行程序

一、新建Java的maven工程

在这里插入图片描述

二、配置环境

在这里插入图片描述

1、配置坐标（pom.xml）

<?xml version="1.0" encoding="UTF-8"?>
<!-- Maven build for the HDFS lab task: compiles for Java 8 and packages a fat jar. -->
<project xmlns="http://maven.apache.org/POM/4.0.0"
         xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
         xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
    <modelVersion>4.0.0</modelVersion>

    <groupId>org.hqp</groupId>
    <artifactId>BigData</artifactId>
    <version>1.0-SNAPSHOT</version>

    <properties>
        <maven.compiler.source>8</maven.compiler.source>
        <maven.compiler.target>8</maven.compiler.target>
        <project.build.sourceEncoding>UTF-8</project.build.sourceEncoding>
    </properties>
    <dependencies>
        <!-- HDFS client API used by the merge program. -->
        <dependency>
            <groupId>org.apache.hadoop</groupId>
            <artifactId>hadoop-client</artifactId>
            <version>3.1.3</version>
        </dependency>
        <dependency>
            <groupId>org.apache.velocity</groupId>
            <artifactId>velocity-tools</artifactId>
            <version>2.0</version>
        </dependency>
        <!-- javax.el is excluded here and re-declared below with an explicit version. -->
        <dependency>
            <groupId>org.apache.hbase</groupId>
            <artifactId>hbase-client</artifactId>
            <version>2.4.14</version>
            <exclusions>
                <exclusion>
                    <groupId>org.glassfish</groupId>
                    <artifactId>javax.el</artifactId>
                </exclusion>
            </exclusions>
        </dependency>
        <dependency>
            <groupId>org.glassfish</groupId>
            <artifactId>javax.el</artifactId>
            <version>3.0.1-b12</version>
        </dependency>
        <dependency>
            <groupId>junit</groupId>
            <artifactId>junit</artifactId>
            <version>4.12</version>
        </dependency>
        <dependency>
            <groupId>com.google.guava</groupId>
            <artifactId>guava</artifactId>
            <version>24.0-jre</version>
        </dependency>
        <!-- Binds SLF4J to log4j 1.2, which reads log4j.properties from the classpath. -->
        <dependency>
            <groupId>org.slf4j</groupId>
            <artifactId>slf4j-log4j12</artifactId>
            <version>1.7.30</version>
        </dependency>
        <!--
            Must match the hadoop-client version: a direct declaration overrides the
            transitive one, so a different version here would mix Hadoop module
            versions on the classpath.
        -->
        <dependency>
            <groupId>org.apache.hadoop</groupId>
            <artifactId>hadoop-auth</artifactId>
            <version>3.1.3</version>
        </dependency>
    </dependencies>
    <build>
        <plugins>
            <plugin>
                <groupId>org.apache.maven.plugins</groupId>
                <artifactId>maven-compiler-plugin</artifactId>
                <version>3.10.1</version>
                <configuration>
                    <source>1.8</source>
                    <target>1.8</target>
                </configuration>
            </plugin>
            <!-- Builds an additional jar-with-dependencies artifact during the package phase. -->
            <plugin>
                <groupId>org.apache.maven.plugins</groupId>
                <artifactId>maven-assembly-plugin</artifactId>
                <version>3.1.1</version>
                <configuration>
                    <descriptorRefs>
                        <descriptorRef>
                            jar-with-dependencies
                        </descriptorRef>
                    </descriptorRefs>
                </configuration>
                <executions>
                    <execution>
                        <id>make-assembly</id>
                        <phase>package</phase>
                        <goals>
                            <goal>single</goal>
                        </goals>
                    </execution>
                </executions>
            </plugin>
        </plugins>
    </build>
</project>

2、配置日志（log4j.properties）

# Root logger: level "debug", writing to two appenders named "stdout" and "R".
log4j.rootLogger=debug,stdout,R

# "stdout" appender: writes to the console using a pattern layout.
log4j.appender.stdout=org.apache.log4j.ConsoleAppender
log4j.appender.stdout.layout=org.apache.log4j.PatternLayout

# Pattern to output the caller's file name and line number.
log4j.appender.stdout.layout.ConversionPattern=%5p [%t] (%F:%L) - %m%n

# "R" appender: rolling log file named example.log.
log4j.appender.R=org.apache.log4j.RollingFileAppender
log4j.appender.R.File=example.log

# Roll the file over once it reaches 100KB.
log4j.appender.R.MaxFileSize=100KB
# Keep up to five backup files
log4j.appender.R.MaxBackupIndex=5

log4j.appender.R.layout=org.apache.log4j.PatternLayout
log4j.appender.R.layout.ConversionPattern=%p %t %c - %m%n

三、编写程序代码(merge.java)

package org.hqp.task.task1;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.*;
import org.apache.hadoop.io.IOUtils;

import java.io.IOException;
import java.net.URI;

/**
 * Merges the {@code .txt} files found directly inside one HDFS directory into a
 * single file named {@code merge.txt} in that same directory.
 *
 * <p>Usage: {@code merge <hdfs-directory>}. For the lab inputs
 * (file1.txt, file2.txt, file3.txt, file4.abc, file5.abc) selecting the
 * {@code .txt} files is the same as excluding the {@code .abc} files.
 */
public class merge {
    /** Name of the output file created inside the input directory. */
    private static final String MERGED_FILE_NAME = "merge.txt";

    /** Suffix (without the leading dot) of the files whose content is merged. */
    private static final String WANTED_SUFFIX = "txt";

    /** Buffer size in bytes used when copying file content. */
    private static final int BUFFER_SIZE = 1024;

    /** Shared HDFS handle; set by {@link #init()} and closed by {@link #destroy()}. */
    static FileSystem fs = null;

    /**
     * Opens the connection to HDFS and stores it in {@link #fs}.
     *
     * @throws Exception if the file system cannot be obtained
     */
    public static void init() throws Exception {
        // Use the fs.defaultFS value from the cluster's core-site.xml.
        URI uri = URI.create("hdfs://node0:9000");
        Configuration configuration = new Configuration();
        // The third argument is the user name the client acts as on HDFS.
        fs = FileSystem.get(uri, configuration, "hqp");
    }

    /**
     * Closes the HDFS connection; does nothing if {@link #init()} never succeeded.
     *
     * @throws IOException if closing the file system fails
     */
    public static void destroy() throws IOException {
        if (fs != null) {
            fs.close();
        }
    }

    /**
     * Concatenates every {@code .txt} file directly under {@code path} into
     * {@code path/merge.txt}, in the order returned by the directory listing.
     * Sub-directories, files with another suffix (or none) and the output file
     * itself are skipped.
     *
     * @param path HDFS directory whose files are merged
     * @throws IOException           if listing, reading or writing fails
     * @throws IllegalStateException if {@link #init()} has not been called
     */
    public static void combine(Path path) throws IOException {
        if (fs == null) {
            throw new IllegalStateException("init() must be called before combine()");
        }
        Path mergedPath = new Path(path, MERGED_FILE_NAME);
        // Create the output file exactly once; try-with-resources closes the
        // stream even when a copy fails half-way.
        try (FSDataOutputStream out = fs.create(mergedPath)) {
            for (FileStatus status : fs.listStatus(path)) {
                if (!status.isFile()) {
                    continue; // only regular files can be opened for reading
                }
                // getPath() is the full path; only the last component is needed.
                String name = status.getPath().getName();
                if (MERGED_FILE_NAME.equals(name)) {
                    continue; // never copy the output file into itself
                }
                if (!WANTED_SUFFIX.equals(suffixOf(name))) {
                    continue;
                }
                try (FSDataInputStream in = fs.open(status.getPath())) {
                    // This overload copies without closing either stream.
                    IOUtils.copyBytes(in, out, BUFFER_SIZE);
                }
            }
        }
    }

    /**
     * Returns the text after the last dot of {@code fileName}, or an empty
     * string when the name has no dot or ends with one.
     */
    private static String suffixOf(String fileName) {
        int dot = fileName.lastIndexOf('.');
        return dot < 0 ? "" : fileName.substring(dot + 1);
    }

    /**
     * Entry point: connects to HDFS, merges the directory given as the first
     * argument and always releases the connection afterwards.
     *
     * @param args {@code args[0]} is the HDFS directory to merge
     * @throws Exception if connecting or merging fails
     */
    public static void main(String[] args) throws Exception {
        if (args.length < 1) {
            throw new IllegalArgumentException("Usage: merge <hdfs-directory>");
        }
        init();
        try {
            combine(new Path(args[0]));
        } finally {
            destroy();
        }
    }
}