Sunday, 17 April 2016

Split text file using Spring Batch


A simple Spring Batch job that splits a large text file into multiple smaller text files, starting a new output file at every line that begins with a given marker string (the delimiter).




SplitFile.java

package com.fiam.gcr.batch.bean;

import java.util.List;

/**
 * Item passed from reader to processor to writer: one section of the input file.
 *
 * <p>The reader fills in the lines, the processor derives the file name, and the
 * writer uses both to create the output file.
 */
public class SplitFile {

    /** Base name (without extension) of the output file for this section. */
    private String fileName;

    /** Lines belonging to this section, in input order. */
    private List<String> fileLines;

    /** Optional counter; none of the classes shown alongside this bean read or set it. */
    private Integer fileCount;

    // ---- lines -------------------------------------------------------------

    /** @return the lines of this section, or {@code null} if none were set */
    public List<String> getFileLines() {
        return this.fileLines;
    }

    /** @param fileLines the lines of this section */
    public void setFileLines(List<String> fileLines) {
        this.fileLines = fileLines;
    }

    // ---- name --------------------------------------------------------------

    /** @return the output file base name, or {@code null} if not yet assigned */
    public String getFileName() {
        return this.fileName;
    }

    /** @param fileName the output file base name */
    public void setFileName(String fileName) {
        this.fileName = fileName;
    }

    // ---- count -------------------------------------------------------------

    /** @return the stored counter value, or {@code null} if never set */
    public Integer getFileCount() {
        return this.fileCount;
    }

    /** @param fileCount the counter value to store */
    public void setFileCount(Integer fileCount) {
        this.fileCount = fileCount;
    }
}







GcrTextFileProcessor.java

package com.fiam.gcr.batch.processor;

import java.util.List;

import org.springframework.batch.core.JobExecution;
import org.springframework.batch.core.StepExecution;
import org.springframework.batch.core.annotation.BeforeStep;
import org.springframework.batch.item.ItemProcessor;
import org.springframework.beans.factory.annotation.Value;
import org.springframework.context.annotation.Scope;
import org.springframework.stereotype.Component;

import com.fiam.gcr.batch.bean.SplitFile;

/**
 * Derives the output file name of a {@link SplitFile} from its first line.
 *
 * <p>The name is the four characters at positions 5 to 8 (zero-based) of the first
 * line, i.e. the text immediately following a five-character prefix such as
 * {@code FDXXX}.
 */
public class GcrTextFileProcessor implements ItemProcessor<SplitFile, SplitFile> {

    /** Zero-based index (inclusive) where the file name starts in the first line. */
    private static final int NAME_START = 5;

    /** Zero-based index (exclusive) where the file name ends in the first line. */
    private static final int NAME_END = 9;

    /**
     * Sets the file name on the given item and returns that same instance.
     *
     * @param inputSplitFile the section produced by the reader
     * @return {@code inputSplitFile} itself; its file name is left untouched when it
     *         has no lines
     * @throws IllegalArgumentException if the first line is too short to contain a name
     */
    @Override
    public SplitFile process(SplitFile inputSplitFile) throws Exception {
        List<String> lines = inputSplitFile.getFileLines();
        if (lines == null || lines.isEmpty()) {
            // Nothing to derive a name from; pass the item through unchanged.
            return inputSplitFile;
        }

        String firstLine = lines.get(0);
        if (firstLine == null || firstLine.length() < NAME_END) {
            // Fail with a message that names the offending line instead of a bare
            // StringIndexOutOfBoundsException from substring().
            throw new IllegalArgumentException(
                    "First line of section is too short to contain a file name at ["
                            + NAME_START + ", " + NAME_END + "): '" + firstLine + "'");
        }

        String name = firstLine.substring(NAME_START, NAME_END);
        System.out.println("Processing- " + name);
        inputSplitFile.setFileName(name);
        return inputSplitFile;
    }

}








GcrTextFileReader.java

package com.fiam.gcr.batch.reader;

import java.io.BufferedReader;
import java.io.FileNotFoundException;
import java.io.FileReader;
import java.util.ArrayList;
import java.util.List;

import org.springframework.batch.core.JobExecution;
import org.springframework.batch.core.StepExecution;
import org.springframework.batch.core.annotation.AfterStep;
import org.springframework.batch.core.annotation.BeforeJob;
import org.springframework.batch.core.annotation.BeforeRead;
import org.springframework.batch.item.ItemReader;
import org.springframework.batch.item.ParseException;
import org.springframework.batch.item.UnexpectedInputException;

import com.fiam.gcr.batch.bean.SplitFile;
import com.fiam.gcr.batch.util.Constants;

/**
 * Reads the input file named by {@link Constants#inputFolder} and
 * {@link Constants#inputFile} and returns it one section at a time.
 *
 * <p>A section starts at a line beginning with {@code FDXXX} and runs up to (but not
 * including) the next such line, or to end of file. Lines that appear before the first
 * marker line are dropped once that marker is seen. If the file contains no marker line
 * at all, its whole content is returned as a single section.
 *
 * <p>The instance keeps its position between calls, so it can only be consumed once.
 */
public class GcrTextFileReader implements ItemReader<SplitFile> {

    /** Prefix of the line that opens a new section. */
    private static final String SECTION_PREFIX = "FDXXX";

    /** Open input, or {@code null} if opening failed or the file has been fully read. */
    private BufferedReader bin;

    /** Number of section starts/ends seen so far; zero means no marker line seen yet. */
    private Integer count;

    /** Lines collected for the section currently being assembled. */
    private List<String> allLines = new ArrayList<String>();

    /** Failure recorded by the constructor, rethrown by {@link #read()}. */
    private FileNotFoundException openFailure;

    /**
     * Opens the input file. A missing file does not fail construction; the failure is
     * printed, remembered, and reported on the first {@link #read()}.
     */
    public GcrTextFileReader() {
        this.count = 0;
        try {
            FileReader fr = new FileReader(Constants.inputFolder + Constants.inputFile);
            this.bin = new BufferedReader(fr);
        } catch (FileNotFoundException e) {
            e.printStackTrace();
            this.openFailure = e;
        }
    }

    /**
     * Returns the next section of the input file.
     *
     * @return the next section, or {@code null} once the file is exhausted
     * @throws FileNotFoundException if the input file could not be opened
     * @throws Exception on any I/O error while reading
     */
    @Override
    public SplitFile read() throws Exception, UnexpectedInputException,
            ParseException {
        System.out.println("Reading");
        if (bin == null) {
            if (openFailure != null) {
                // Report the real cause instead of a NullPointerException.
                throw openFailure;
            }
            // Already exhausted and closed on an earlier call.
            return null;
        }

        SplitFile splitFile = new SplitFile();
        String line = bin.readLine();
        while (line != null) {
            if (line.startsWith(SECTION_PREFIX)) {
                if (count > 0) {
                    // A new marker closes the section in progress. The marker line
                    // itself is kept as the first line of the next section.
                    splitFile.setFileLines(allLines);
                    allLines = new ArrayList<String>();
                    allLines.add(line);
                    count++;
                    return splitFile;
                }
                // First marker in the file: discard anything collected before it.
                allLines = new ArrayList<String>();
                allLines.add(line);
                count++;
            } else {
                allLines.add(line);
            }
            line = bin.readLine();
        }

        // End of file: flush the last section, if any.
        if (!allLines.isEmpty()) {
            splitFile.setFileLines(allLines);
            allLines = new ArrayList<String>();
            count++;
            return splitFile;
        }

        // Nothing left: release the file handle and signal end of input.
        count = 0;
        BufferedReader exhausted = bin;
        bin = null;
        exhausted.close();
        return null;
    }

}








App.java

package com.fiam.gcr.batch.runnable;

import java.io.File;

import org.apache.commons.io.FileUtils;
import org.springframework.batch.core.Job;
import org.springframework.batch.core.JobExecution;
import org.springframework.batch.core.JobParameter;
import org.springframework.batch.core.JobParameters;
import org.springframework.batch.core.launch.JobLauncher;
import org.springframework.context.ApplicationContext;
import org.springframework.context.support.ClassPathXmlApplicationContext;

import com.fiam.gcr.batch.util.Constants;

public class App {
public static void main(String[] args) {

String[] springConfig  = 
"spring/batch/config/applicationContext.xml",
"spring/batch/config/jobConfig.xml" 
};
ApplicationContext context = 
new ClassPathXmlApplicationContext(springConfig);
JobLauncher jobLauncher = (JobLauncher) context.getBean("jobLauncher");
Job job = (Job) context.getBean("clean_split");

try {
JobExecution execution = jobLauncher.run(job, new JobParameters());
System.out.println("Delete Status : " + execution.getExecutionContext().getString("deleteStatus"));
System.out.println("Total Files created : " + execution.getExecutionContext().getInt("totalFilesCreated"));
System.out.println("Validation : " + execution.getExecutionContext().getString("validationStatus"));
System.out.println("Exit Status : " + execution.getStatus());

} catch (Exception e) {
e.printStackTrace();
}

System.out.println("Done");

}
}









CleanDirectory.java

package com.fiam.gcr.batch.tasklet;

import java.io.File;

import org.apache.commons.io.FileUtils;
import org.springframework.batch.core.StepContribution;
import org.springframework.batch.core.scope.context.ChunkContext;
import org.springframework.batch.core.step.tasklet.Tasklet;
import org.springframework.batch.repeat.RepeatStatus;
import org.springframework.beans.factory.InitializingBean;
import org.springframework.core.io.Resource;
import org.springframework.util.Assert;

import com.fiam.gcr.batch.util.Constants;

public class CleanDirectory implements Tasklet, InitializingBean{
private Resource directory;

 @Override
 public void afterPropertiesSet() throws Exception {
Assert.notNull(directory, "directory must be set");
 }
@Override
public RepeatStatus execute(StepContribution contribution, 
              ChunkContext chunkContext) throws Exception {

File directory = new File(Constants.outputFolder);
FileUtils.cleanDirectory(directory);
chunkContext
        .getStepContext()
        .getStepExecution()
        .getJobExecution()
        .getExecutionContext()
        .put("deleteStatus", "SUCCESS");
System.out.println("Deleted all existing files.");
return RepeatStatus.FINISHED;
 }

 public Resource getDirectory() {
return directory;
 }

 public void setDirectory(Resource directory) {
this.directory = directory;
 }

}








SplitValidator.java

package com.fiam.gcr.batch.tasklet;

import java.io.BufferedReader;
import java.io.File;
import java.io.FileReader;
import java.util.ArrayList;
import java.util.HashSet;
import java.util.List;
import java.util.Set;

import org.apache.commons.io.FileUtils;
import org.apache.commons.io.FilenameUtils;
import org.springframework.batch.core.StepContribution;
import org.springframework.batch.core.scope.context.ChunkContext;
import org.springframework.batch.core.step.tasklet.Tasklet;
import org.springframework.batch.repeat.RepeatStatus;
import org.springframework.beans.factory.InitializingBean;
import org.springframework.util.Assert;

import com.fiam.gcr.batch.util.Constants;

public class SplitValidator implements Tasklet, InitializingBean{
private String outputFolder;
private String inputFolder;
private String inputFileName;

 @Override
 public void afterPropertiesSet() throws Exception {
Assert.notNull(outputFolder, "outputFolder value must be set");
 }
@Override
public RepeatStatus execute(StepContribution contribution, 
              ChunkContext chunkContext) throws Exception {
System.out.println("Validating...");
FileReader fr = new FileReader(inputFolder+inputFileName);
BufferedReader bin = new BufferedReader(fr);
String line="";
List<String> accountList = new ArrayList<String>();
Set<String> allAccounts = new HashSet<String>();
while ((line = bin.readLine()) != null) {
if(line.startsWith("FDXXX")){
if (line.length()>8) {
accountList.add(line.substring(5, 9));
allAccounts.add(line.substring(5, 9));
}
}
}
System.out.println("Accounts to be processed : "+allAccounts);
List<String> filesCreated = new ArrayList<String>();


File[] files = new File(outputFolder).listFiles();
//If this pathname does not denote a directory, then listFiles() returns null. 

for (File file : files) {
   if (file.isFile()) {
    filesCreated.add(FilenameUtils.removeExtension(file.getName()));
   }
}
System.out.println("Files created : "+filesCreated);
if(allAccounts.size()==filesCreated.size()){
chunkContext
       .getStepContext()
       .getStepExecution()
       .getJobExecution()
       .getExecutionContext()
       .put("validationStatus", "SUCCESS");
}else{
chunkContext
       .getStepContext()
       .getStepExecution()
       .getJobExecution()
       .getExecutionContext()
       .put("validationStatus", "FAILED");
}
return RepeatStatus.FINISHED;
}

public void setOutputFolder(String outputFolder) {
this.outputFolder = outputFolder;
}

public void setInputFolder(String inputFolder) {
this.inputFolder = inputFolder;
}

public void setInputFileName(String inputFileName) {
this.inputFileName = inputFileName;
}


}









Constants.java

package com.fiam.gcr.batch.util;

/**
 * Hard-coded file locations shared by the reader, the writer and the clean-up tasklet.
 *
 * <p>Both folder values end with a path separator, so a file name can be appended
 * directly.
 */
public final class Constants {

    /** Folder that contains the file to split. */
    public static final String inputFolder = "F:\\files\\GCR\\input\\";

    /** Name of the file to split, relative to {@link #inputFolder}. */
    public static final String inputFile = "simplet.txt";

    /** Folder that receives the generated files. */
    public static final String outputFolder = "F:\\files\\GCR\\output\\";

    /** Constants holder; not meant to be instantiated. */
    private Constants() {
    }
}






GcrTextFileWriter.java

package com.fiam.gcr.batch.writer;

import java.io.BufferedWriter;
import java.io.File;
import java.io.FileWriter;
import java.util.List;

import org.springframework.batch.core.JobExecution;
import org.springframework.batch.core.StepExecution;
import org.springframework.batch.core.annotation.AfterStep;
import org.springframework.batch.core.annotation.BeforeStep;
import org.springframework.batch.item.ItemWriter;

import com.fiam.gcr.batch.bean.SplitFile;
import com.fiam.gcr.batch.util.Constants;

public class GcrTextFileWriter implements ItemWriter<SplitFile> {
private int totalFilesCreated=0;
@BeforeStep
public void BeforeStep(StepExecution stepExecution){
stepExecution
        .getJobExecution()
        .getExecutionContext()
        .putInt("totalFilesCreated", 0);
System.out.println("Splitting process started");
}
@AfterStep
    public void AfterStep(StepExecution stepExecution) {

        //update
        stepExecution
        .getJobExecution()
        .getExecutionContext()
        .put("totalFilesCreated", totalFilesCreated);
        System.out.println("Splitting process ended.");
        
}
@Override
public void write(List<? extends SplitFile> splitFiles) throws Exception {
if (splitFiles.size()>0) {
for (SplitFile splitFile : splitFiles) {

File file = new File(Constants.outputFolder
+ splitFile.getFileName() + ".txt");

// if file doesnt exists, then create it
if (!file.exists()) {
file.createNewFile();
FileWriter fw = new FileWriter(file.getAbsoluteFile());
BufferedWriter bw = new BufferedWriter(fw);

for (String line : splitFile.getFileLines()) {
bw.write(line);
bw.newLine();
}
bw.close();
totalFilesCreated++;
}

}
System.out.println("__________"+splitFiles.size()+"__________");
}
}

}






applicationContext.xml

<!-- Shared Spring Batch infrastructure; imported by jobConfig.xml. -->
<beans xmlns="http://www.springframework.org/schema/beans"
xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
xsi:schemaLocation="
http://www.springframework.org/schema/beans 
http://www.springframework.org/schema/beans/spring-beans-3.2.xsd">

<!-- Transaction manager used by the job repository below and by the chunk step in jobConfig.xml. -->
<bean id="transactionManager" class="org.springframework.batch.support.transaction.ResourcelessTransactionManager"/>
    <!-- Launcher that App looks up by the id "jobLauncher" to start the job. -->
    <bean id="jobLauncher" class="org.springframework.batch.core.launch.support.SimpleJobLauncher">
        <property name="jobRepository" ref="jobRepository"/>
    </bean>
    <!-- Map-based job repository; NOTE(review): presumably keeps job metadata in memory only, confirm that no restart support is needed. -->
    <bean id="jobRepository" class="org.springframework.batch.core.repository.support.MapJobRepositoryFactoryBean">
        <property name="transactionManager" ref="transactionManager"/>
    </bean>
    <!-- Abstract parent definition; jobs that declare parent="simpleJob" inherit the jobRepository property. -->
    <bean id="simpleJob" class="org.springframework.batch.core.job.SimpleJob" abstract="true">
        <property name="jobRepository" ref="jobRepository" />
    </bean>
</beans>







jobConfig.xml

<!-- Defines the "clean_split" job: clean the output folder, split the input file, validate the result. -->
<beans xmlns="http://www.springframework.org/schema/beans"
xmlns:batch="http://www.springframework.org/schema/batch" 
xmlns:task="http://www.springframework.org/schema/task"
xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
xsi:schemaLocation="http://www.springframework.org/schema/batch
http://www.springframework.org/schema/batch/spring-batch-2.2.xsd
http://www.springframework.org/schema/beans 
http://www.springframework.org/schema/beans/spring-beans-3.2.xsd">

<!-- Brings in transactionManager, jobRepository, jobLauncher and the simpleJob parent. -->
<import resource="applicationContext.xml"/>

  <!-- Reader, processor and writer of the splitFiles step. The reader opens the input file in its constructor. -->
  <bean id="gcrTextFileReader" class="com.fiam.gcr.batch.reader.GcrTextFileReader" />
 
  <bean id="gcrTextFileProcessor" class="com.fiam.gcr.batch.processor.GcrTextFileProcessor" scope="step"/>
 
    <bean id="gcrTextFileWriter" class="com.fiam.gcr.batch.writer.GcrTextFileWriter" /> 
  <!--   
    <batch:job id="testJob" job-repository="jobRepository" parent="simpleJob">
    <batch:step id="step1">
    <batch:tasklet transaction-manager="transactionManager">
    <batch:chunk reader="gcrTextFileReader" processor="gcrTextFileProcessor" writer="gcrTextFileWriter" commit-interval="2"/>
    </batch:tasklet>
    </batch:step>
    </batch:job> 
     -->
    
  <!--    <batch:job id="testJob" job-repository="jobRepository" parent="simpleJob">
    <batch:step id="step1">
    <batch:tasklet transaction-manager="transactionManager">
    <batch:chunk reader="gcrTextFileReader" processor="gcrTextFileProcessor" writer="gcrTextFileWriter" commit-interval="2"/>
    </batch:tasklet>
    </batch:step>
    </batch:job>
     --> 
  
   <!-- The job that App launches by the id "clean_split". Steps run in the order cleanDir, splitFiles, validateSplit. -->
   <job id="clean_split" job-repository="jobRepository" parent="simpleJob" xmlns="http://www.springframework.org/schema/batch">
<!-- Step 1: empty the output folder. -->
<step id="cleanDir" next="splitFiles">
<tasklet ref="cleanDirTasklet" />
</step>
<!-- Step 2: chunk-oriented split; commit-interval is the number of SplitFile items handled per chunk. -->
<step id="splitFiles" next="validateSplit">
<tasklet transaction-manager="transactionManager">
<chunk reader="gcrTextFileReader" processor="gcrTextFileProcessor" writer="gcrTextFileWriter" commit-interval="3" />
</tasklet>
</step>
<!-- Step 3: compare the number of distinct codes in the input with the number of files written. -->
<step id="validateSplit">
<tasklet ref="validateSplitTasklet" />
</step>
  </job>
  
  
  <!-- CleanDirectory only asserts that "directory" is set; the folder it actually cleans comes from Constants.outputFolder. -->
  <!-- NOTE(review): backslashes are not escape characters in XML, so these values contain doubled backslashes; confirm the paths resolve as intended. -->
  <bean id="cleanDirTasklet" class="com.fiam.gcr.batch.tasklet.CleanDirectory" >
<property name="directory" value="F:\\files\\GCR\\output\\" />
  </bean>
  
  <!-- SplitValidator opens inputFolder + inputFileName and lists the files in outputFolder. -->
  <bean id="validateSplitTasklet" class="com.fiam.gcr.batch.tasklet.SplitValidator" >
<property name="outputFolder" value="F:\\files\\GCR\\output\\" />
<property name="inputFolder" value="F:\\files\\GCR\\input\\" />
<property name="inputFileName" value="simplet.txt" />
  </bean>
  
</beans>




pom.xml

<!-- Maven build for the file-splitting batch example. -->
<project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/maven-v4_0_0.xsd">
<modelVersion>4.0.0</modelVersion>
<groupId>com.mkyong</groupId>
<artifactId>SpringBatchExample</artifactId>
<packaging>jar</packaging>
<version>1.0-SNAPSHOT</version>
<name>SpringBatchExample</name>
<url>http://maven.apache.org</url>

<!-- Versions shared by the dependency and plugin sections below. -->
<!-- NOTE(review): Spring Batch 3.0.x is presumably built against Spring Framework 4.0.x; confirm that pinning spring-core and spring-jdbc to 3.2.2 is intended. -->
<properties>
<jdk.version>1.6</jdk.version>
<spring.version>3.2.2.RELEASE</spring.version>
<spring.batch.version>3.0.6.RELEASE</spring.batch.version>
<mysql.driver.version>5.1.25</mysql.driver.version>
</properties>

<dependencies>

<!-- Spring Core -->
<dependency>
<groupId>org.springframework</groupId>
<artifactId>spring-core</artifactId>
<version>${spring.version}</version>
</dependency>

<!-- Spring JDBC -->
<dependency>
<groupId>org.springframework</groupId>
<artifactId>spring-jdbc</artifactId>
<version>${spring.version}</version>
</dependency>

<!-- Spring Batch dependencies -->
<dependency>
<groupId>org.springframework.batch</groupId>
<artifactId>spring-batch-core</artifactId>
<version>${spring.batch.version}</version>
</dependency>
<dependency>
<groupId>org.springframework.batch</groupId>
<artifactId>spring-batch-infrastructure</artifactId>
<version>${spring.batch.version}</version>
</dependency>

<!-- MySQL database driver -->
<!-- NOTE(review): none of the classes shown with this build use JDBC or MySQL; confirm whether this driver is needed. -->
<dependency>
<groupId>mysql</groupId>
<artifactId>mysql-connector-java</artifactId>
<version>${mysql.driver.version}</version>
</dependency>

<!-- Apache Commons IO: FileUtils and FilenameUtils are used by the tasklets. -->
<dependency>
<groupId>commons-io</groupId>
<artifactId>commons-io</artifactId>
<version>2.4</version>
</dependency>


</dependencies>
<build>
<finalName>spring-batch</finalName>
<plugins>
<!-- Generates Eclipse project files and attaches sources. -->
<plugin>
<groupId>org.apache.maven.plugins</groupId>
<artifactId>maven-eclipse-plugin</artifactId>
<version>2.9</version>
<configuration>
<downloadSources>true</downloadSources>
<downloadJavadocs>false</downloadJavadocs>
</configuration>
</plugin>
<!-- Compiles for the Java level given by jdk.version. -->
<plugin>
<groupId>org.apache.maven.plugins</groupId>
<artifactId>maven-compiler-plugin</artifactId>
<version>2.3.2</version>
<configuration>
<source>${jdk.version}</source>
<target>${jdk.version}</target>
</configuration>
</plugin>
</plugins>
</build>

</project>






The source files can be downloaded from here.