入口类:Engine
Idea中配置基本运行信息:
VM options:-Ddatax.home=D:\app\workspace\idea\DataX\target\datax\datax
program arguments:"-job" "D:\\app\\workspace\\idea\\DataX\\jobJson\\MysqlToMysql.json" "-jobid" "0" "-mode" "standalone"
在代码入口处可以进行打印:
LOG.info("DATAX.HOME:"+System.getProperty("datax.home"));
Engine.entry(args);
主要流程记录
- 判断传入的插件json配置文件的获取地方,读取插件配置json文件的信息,转为json对象
- 获取core.json 信息 合并到 插件配置json文件的json对象中:
{ "common": { "column": { "dateFormat": "yyyy-MM-dd", "datetimeFormat": "yyyy-MM-dd HH:mm:ss", "encoding": "utf-8", "extraFormats": ["yyyyMMdd"], "timeFormat": "HH:mm:ss", "timeZone": "GMT+8" } }, "core": { "container": { "job": { "reportInterval": 10000 }, "taskGroup": { "channel": 5 }, "trace": { "enable": "false" } }, "dataXServer": { "address": "http://localhost:7001/api", "reportDataxLog": false, "reportPerfLog": false, "timeout": 10000 }, "statistics": { "collector": { "plugin": { "maxDirtyNumber": 10, "taskClass": "com.alibaba.datax.core.statistics.plugin.task.StdoutPluginCollector" } } }, "transport": { "channel": { "byteCapacity": 67108864, "capacity": 512, "class": "com.alibaba.datax.core.transport.channel.memory.MemoryChannel", "flowControlInterval": 20, "speed": { "byte": -1, "record": -1 } }, "exchanger": { "bufferSize": 32, "class": "com.alibaba.datax.core.plugin.BufferedRecordExchanger" } } }, "entry": { "jvm": "-Xms1G -Xmx1G" }, "job": { "content": [{ "reader": { "name": "mysqlreader", "parameter": { "connection": [{ "jdbcUrl": ["jdbc:mysql://localhost:3306/datax"], "querySql": ["select id,username,telephone from user;", "select id,username,telephone from user_copy;"] }], "password": "root123", "username": "root" } }, "writer": { "name": "streamwriter", "parameter": { "encoding": "UTF-8", "print": true } } }], "setting": { "speed": { "channel": 1 } } } }
- 获取插件配置json信息readerPluginName,writerPluginName --> 加入到pluginList
- 加载插件,过程如下:
获取到reader插件所在的path:D:\app\workspace\idea\DataX\target\datax\datax\plugin\reader
遍历目录下的所有插件
获取每个插件的plugin.json,获取插件的信息比如:
{
"name": "drdsreader",
"class": "com.alibaba.datax.plugin.reader.drdsreader.DrdsReader",
"description": "useScene: prod. mechanism: Jdbc connection using the database, execute select sql, retrieve data from the ResultSet. warn: The more you know about the database, the less problems you encounter.",
"developer": "alibaba"
}
去掉不需要的插件信息,加载到最初的json对象中
json对象信息如下:
{
"common": {
"column": {
"dateFormat": "yyyy-MM-dd",
"datetimeFormat": "yyyy-MM-dd HH:mm:ss",
"encoding": "utf-8",
"extraFormats": ["yyyyMMdd"],
"timeFormat": "HH:mm:ss",
"timeZone": "GMT+8"
}
},
"core": {
"container": {
"job": {
"id": 0,
"reportInterval": 10000
},
"taskGroup": {
"channel": 5
},
"trace": {
"enable": "false"
}
},
"dataXServer": {
"address": "http://localhost:7001/api",
"reportDataxLog": false,
"reportPerfLog": false,
"timeout": 10000
},
"statistics": {
"collector": {
"plugin": {
"maxDirtyNumber": 10,
"taskClass": "com.alibaba.datax.core.statistics.plugin.task.StdoutPluginCollector"
}
}
},
"transport": {
"channel": {
"byteCapacity": 67108864,
"capacity": 512,
"class": "com.alibaba.datax.core.transport.channel.memory.MemoryChannel",
"flowControlInterval": 20,
"speed": {
"byte": -1,
"record": -1
}
},
"exchanger": {
"bufferSize": 32,
"class": "com.alibaba.datax.core.plugin.BufferedRecordExchanger"
}
}
},
"entry": {
"jvm": "-Xms1G -Xmx1G"
},
"job": {
"content": [{
"reader": {
"name": "mysqlreader",
"parameter": {
"connection": [{
"jdbcUrl": ["jdbc:mysql://localhost:3306/datax"],
"querySql": ["select id,username,telephone from user;", "select id,username,telephone from user_copy;"]
}],
"password": "root123",
"username": "root"
}
},
"writer": {
"name": "streamwriter",
"parameter": {
"encoding": "UTF-8",
"print": true
}
}
}],
"setting": {
"speed": {
"channel": 1
}
}
},
"plugin": {
"reader": {
"mysqlreader": {
"class": "com.alibaba.datax.plugin.reader.mysqlreader.MysqlReader",
"description": "useScene: prod. mechanism: Jdbc connection using the database, execute select sql, retrieve data from the ResultSet. warn: The more you know about the database, the less problems you encounter.",
"developer": "alibaba",
"name": "mysqlreader",
"path": "D:\\app\\workspace\\idea\\DataX\\target\\datax\\datax\\plugin\\reader\\mysqlreader"
}
},
"writer": {
"streamwriter": {
"class": "com.alibaba.datax.plugin.writer.streamwriter.StreamWriter",
"description": {
"mechanism": "use datax framework to transport data to stream.",
"useScene": "only for developer test.",
"warn": "Never use it in your real job."
},
"developer": "alibaba",
"name": "streamwriter",
"path": "D:\\app\\workspace\\idea\\DataX\\target\\datax\\datax\\plugin\\writer\\streamwriter"
}
}
}
}
- JobContainer初始化
container = new JobContainer(allConf); JobContainer初始化
container.start(); 启动JobContainer
jobContainer主要负责的工作全部在start()里面,包括init、prepare、split、scheduler、post以及destroy和statistics
init:reader和writer的初始化
//必须先Reader ,后Writer 此处加载 读取和写入的插件 根据json配置文件的信息
this.jobReader = this.initJobReader(jobPluginCollector);
this.jobWriter = this.initJobWriter(jobPluginCollector);
根据json配置文件获取插件具体的class,然后进行加载class,获取具体的插件对象(write类似)
Reader.Job jobReader = (Reader.Job) LoadUtil.loadJobPlugin(
PluginType.READER, this.readerPluginName);
执行具体插件的 init方法
jobReader.init();
prepare:
执行具体插件的 prepare
jobReader.prepare();
this.split();
List<Configuration> readerTaskConfigs = this
.doReaderSplit(this.needChannelNumber);
adviceNumber是框架建议插件切分的任务数,插件开发人员最好切分出来的任务数>=adviceNumber。
例如用户同步一张Mysql单表,但是认为可以到10并发吞吐量,插件开发人员最好对该表进行切分,比如使用主键范围切分,并且如果最终切分任务数到>=10,我们就可以提供给用户最大的吞吐量。
int taskNumber = readerTaskConfigs.size();获取切分的任务个数
- 根据切分的规则生成各个任务的配置文件
[{
"internal": {
"reader": {
"name": "mysqlreader",
"parameter": {
"fetchSize": -2147483648,
"isTableMode": false,
"jdbcUrl": "jdbc:mysql://localhost:3306/datax?yearIsDateType=false&zeroDateTimeBehavior=convertToNull&tinyInt1isBit=false&rewriteBatchedStatements=true",
"loadBalanceResourceMark": "localhost",
"password": "root123",
"querySql": "select id,username,telephone from user;",
"tableNumber": 0,
"username": "root"
}
},
"taskId": 0,
"writer": {
"name": "streamwriter",
"parameter": {
"encoding": "UTF-8",
"print": true
}
}
},
"keys": ["writer.parameter.print", "writer.parameter.encoding", "reader.parameter.fetchSize", "reader.parameter.jdbcUrl", "reader.parameter.password", "writer.name", "reader.parameter.querySql", "reader.parameter.loadBalanceResourceMark", "reader.parameter.tableNumber", "reader.name", "reader.parameter.isTableMode", "reader.parameter.username", "taskId"],
"secretKeyPathSet": []
}, {
"internal": {
"reader": {
"name": "mysqlreader",
"parameter": {
"fetchSize": -2147483648,
"isTableMode": false,
"jdbcUrl": "jdbc:mysql://localhost:3306/datax?yearIsDateType=false&zeroDateTimeBehavior=convertToNull&tinyInt1isBit=false&rewriteBatchedStatements=true",
"loadBalanceResourceMark": "localhost",
"password": "root123",
"querySql": "select id,username,telephone from user_copy;",
"tableNumber": 0,
"username": "root"
}
},
"taskId": 1,
"writer": {
"name": "streamwriter",
"parameter": {
"$ref": "$[0].internal.writer.parameter"
}
}
},
"keys": ["writer.parameter.print", "writer.parameter.encoding", "reader.parameter.fetchSize", "reader.parameter.jdbcUrl", "reader.parameter.password", "writer.name", "reader.parameter.querySql", "reader.parameter.loadBalanceResourceMark", "reader.parameter.tableNumber", "reader.name", "reader.parameter.isTableMode", "reader.parameter.username", "taskId"],
"secretKeyPathSet": []
}]
加入到最初的json对象中:
{
"common": {
"column": {
"dateFormat": "yyyy-MM-dd",
"datetimeFormat": "yyyy-MM-dd HH:mm:ss",
"encoding": "utf-8",
"extraFormats": ["yyyyMMdd"],
"timeFormat": "HH:mm:ss",
"timeZone": "GMT+8"
}
},
"core": {
"container": {
"job": {
"id": 0,
"mode": "standalone",
"reportInterval": 10000
},
"taskGroup": {
"channel": 5
},
"trace": {
"enable": "false"
}
},
"dataXServer": {
"address": "http://localhost:7001/api",
"reportDataxLog": false,
"reportPerfLog": false,
"timeout": 10000
},
"statistics": {
"collector": {
"plugin": {
"maxDirtyNumber": 10,
"taskClass": "com.alibaba.datax.core.statistics.plugin.task.StdoutPluginCollector"
}
}
},
"transport": {
"channel": {
"byteCapacity": 67108864,
"capacity": 512,
"class": "com.alibaba.datax.core.transport.channel.memory.MemoryChannel",
"flowControlInterval": 20,
"speed": {
"byte": -1,
"record": -1
}
},
"exchanger": {
"bufferSize": 32,
"class": "com.alibaba.datax.core.plugin.BufferedRecordExchanger"
}
}
},
"entry": {
"jvm": "-Xms1G -Xmx1G"
},
"job": {
"content": [{
"reader": {
"name": "mysqlreader",
"parameter": {
"fetchSize": -2147483648,
"isTableMode": false,
"jdbcUrl": "jdbc:mysql://localhost:3306/datax?yearIsDateType=false&zeroDateTimeBehavior=convertToNull&tinyInt1isBit=false&rewriteBatchedStatements=true",
"loadBalanceResourceMark": "localhost",
"password": "root123",
"querySql": "select id,username,telephone from user;",
"tableNumber": 0,
"username": "root"
}
},
"taskId": 0,
"writer": {
"name": "streamwriter",
"parameter": {
"encoding": "UTF-8",
"print": true
}
}
}, {
"reader": {
"name": "mysqlreader",
"parameter": {
"fetchSize": -2147483648,
"isTableMode": false,
"jdbcUrl": "jdbc:mysql://localhost:3306/datax?yearIsDateType=false&zeroDateTimeBehavior=convertToNull&tinyInt1isBit=false&rewriteBatchedStatements=true",
"loadBalanceResourceMark": "localhost",
"password": "root123",
"querySql": "select id,username,telephone from user_copy;",
"tableNumber": 0,
"username": "root"
}
},
"taskId": 1,
"writer": {
"name": "streamwriter",
"parameter": {
"$ref": "$.job.content[0].writer.parameter"
}
}
}],
"setting": {
"speed": {
"channel": 1
}
}
},
"plugin": {
"reader": {
"mysqlreader": {
"class": "com.alibaba.datax.plugin.reader.mysqlreader.MysqlReader",
"description": "useScene: prod. mechanism: Jdbc connection using the database, execute select sql, retrieve data from the ResultSet. warn: The more you know about the database, the less problems you encounter.",
"developer": "alibaba",
"name": "mysqlreader",
"path": "D:\\app\\workspace\\idea\\DataX\\target\\datax\\datax\\plugin\\reader\\mysqlreader"
}
},
"writer": {
"streamwriter": {
"class": "com.alibaba.datax.plugin.writer.streamwriter.StreamWriter",
"description": {
"mechanism": "use datax framework to transport data to stream.",
"useScene": "only for developer test.",
"warn": "Never use it in your real job."
},
"developer": "alibaba",
"name": "streamwriter",
"path": "D:\\app\\workspace\\idea\\DataX\\target\\datax\\datax\\plugin\\writer\\streamwriter"
}
}
}
}
- 任务调度this.schedule();
1- schedule首先完成的工作是把上一步reader和writer split的结果整合到具体taskGroupContainer中,同时不同的执行模式调用不同的调度策略,将所有任务调度起来
2- 通过获取配置信息得到每个taskGroup需要运行哪些tasks任务(公平的分配 task 到对应的 taskGroup 中。具体分配任务在此方法中)
List<Configuration> taskGroupConfigs = JobAssignUtil.assignFairly(this.configuration,
this.needChannelNumber, channelsPerTaskGroup);
获取taskGroupNumber
int taskGroupNumber = (int) Math.ceil(1.0 * channelNumber / channelsPerTaskGroup);// 返回大于或者等于指定表达式的最小整数,即向上取整
List<Configuration> taskGroupConfig = doAssign(resourceMarkAndTaskIdMap, configuration, taskGroupNumber);
会产生 集合类型的完整信息的json对象
计算所有 taskGroupConfig中 每个包含的task的个数的总和
int totalTasks = calculateTaskCount(configurations);
初始化线程池 跟据 taskGroup的个数
this.taskGroupContainerExecutorService = Executors
.newFixedThreadPool(configurations.size());
执行:
for (Configuration taskGroupConfiguration : configurations) {
TaskGroupContainerRunner taskGroupContainerRunner = newTaskGroupContainerRunner(taskGroupConfiguration);
this.taskGroupContainerExecutorService.execute(taskGroupContainerRunner);
}
最终执行run:方法
this.taskGroupContainer.start();
具体逻辑TaskGroupContainer类的start方法
获取任务相关的参数,比如重试次数,重试间隔,任务信息汇报间隔等
获取具体task的信息
List<Configuration> taskConfigs = this.configuration
.getListConfiguration(CoreConstant.DATAX_JOB_CONTENT);
初始化任务执行前的信息:
Map<Integer, Configuration> taskConfigMap = buildTaskConfigMap(taskConfigs); //taskId与task配置
List<Configuration> taskQueue = buildRemainTasks(taskConfigs); //待运行task列表
Map<Integer, TaskExecutor> taskFailedExecutorMap = new HashMap<Integer, TaskExecutor>(); //taskId与上次失败实例
List<TaskExecutor> runTasks = new ArrayList<TaskExecutor>(channelNumber); //正在运行task
Map<Integer, Long> taskStartTimeMap = new HashMap<Integer, Long>(); //任务开始时间
代码注释:
// NOTE(review): fragment quoted from TaskGroupContainer.start() — the body of one
// iteration of its scheduling loop. Names such as runTasks, taskQueue, taskConfigMap,
// containerCommunicator and taskMonitor are fields/locals of the enclosing method
// and are not declared in this excerpt.
{
//1. Check the state of every task in this task group
boolean failedOrKilled = false;
Map<Integer, Communication> communicationMap = containerCommunicator.getCommunicationMap();
for(Map.Entry<Integer, Communication> entry : communicationMap.entrySet()){//iterate over all tasks
Integer taskId = entry.getKey();
Communication taskCommunication = entry.getValue();
if(!taskCommunication.isFinished()){//skip tasks that have not finished yet
continue;
}
TaskExecutor taskExecutor = removeTask(runTasks, taskId);//task is finished: remove it from the running list
//removed from runTasks above, so remove it from the monitor as well
taskMonitor.removeTask(taskId);
//on failure: if the task supports failover and has not exceeded the retry limit, re-queue it
if(taskCommunication.getState() == State.FAILED){
taskFailedExecutorMap.put(taskId, taskExecutor);
if(taskExecutor.supportFailOver() && taskExecutor.getAttemptCount() < taskMaxRetryTimes){
taskExecutor.shutdown(); //shut down the old executor
containerCommunicator.resetCommunication(taskId); //reset the task's recorded state
Configuration taskConfig = taskConfigMap.get(taskId);
taskQueue.add(taskConfig); //put the task back into the pending queue
}else{
failedOrKilled = true;
break;
}
}else if(taskCommunication.getState() == State.KILLED){
failedOrKilled = true;
break;
}else if(taskCommunication.getState() == State.SUCCEEDED){
Long taskStartTime = taskStartTimeMap.get(taskId);
if(taskStartTime != null){
Long usedTime = System.currentTimeMillis() - taskStartTime;
LOG.info("taskGroup[{}] taskId[{}] is successed, used[{}]ms",
this.taskGroupId, taskId, usedTime);
//usedTime*1000*1000 converts ms to the ns unit stored in PerfRecord; this is only a
//simple registration used to print the longest-running task, hence the dedicated static method
PerfRecord.addPerfRecord(taskGroupId, taskId, PerfRecord.PHASE.TASK_TOTAL,taskStartTime, usedTime * 1000L * 1000L);
taskStartTimeMap.remove(taskId);
taskConfigMap.remove(taskId);
}
}
}
// 2. If the aggregate state of this taskGroup's executors is failed/killed, report the error
if (failedOrKilled) {
lastTaskGroupContainerCommunication = reportTaskGroupCommunication(
lastTaskGroupContainerCommunication, taskCountInThisTaskGroup);
throw DataXException.asDataXException(
FrameworkErrorCode.PLUGIN_RUNTIME_ERROR, lastTaskGroupContainerCommunication.getThrowable());
}
//3. Tasks are still pending and the running count is below the channel limit: start more
Iterator<Configuration> iterator = taskQueue.iterator();
while(iterator.hasNext() && runTasks.size() < channelNumber){
Configuration taskConfig = iterator.next();
Integer taskId = taskConfig.getInt(CoreConstant.TASK_ID);
int attemptCount = 1;
TaskExecutor lastExecutor = taskFailedExecutorMap.get(taskId);
if(lastExecutor!=null){
attemptCount = lastExecutor.getAttemptCount() + 1;
long now = System.currentTimeMillis();
long failedTime = lastExecutor.getTimeStamp();
if(now - failedTime < taskRetryIntervalInMsec){ //retry interval not reached yet; keep it in the queue
continue;
}
if(!lastExecutor.isShutdown()){ //the previously failed executor has not terminated yet
if(now - failedTime > taskMaxWaitInMsec){
markCommunicationFailed(taskId);
reportTaskGroupCommunication(lastTaskGroupContainerCommunication, taskCountInThisTaskGroup);
throw DataXException.asDataXException(CommonErrorCode.WAIT_TIME_EXCEED, "task failover等待超时");
}else{
lastExecutor.shutdown(); //try to shut it down once more
continue;
}
}else{
LOG.info("taskGroup[{}] taskId[{}] attemptCount[{}] has already shutdown",
this.taskGroupId, taskId, lastExecutor.getAttemptCount());
}
}
Configuration taskConfigForRun = taskMaxRetryTimes > 1 ? taskConfig.clone() : taskConfig;
TaskExecutor taskExecutor = new TaskExecutor(taskConfigForRun, attemptCount);//constructs the readerRunner and writerRunner here
taskStartTimeMap.put(taskId, System.currentTimeMillis());
taskExecutor.doStart();//this is where the actual read/write task is started
iterator.remove();//remove from the pending queue
runTasks.add(taskExecutor);//add to the running list
//the task was added to runTasks above, so register it with the monitor
taskMonitor.registerTask(taskId, this.containerCommunicator.getCommunication(taskId));
taskFailedExecutorMap.remove(taskId);
LOG.info("taskGroup[{}] taskId[{}] attemptCount[{}] is started",
this.taskGroupId, taskId, attemptCount);
}
//4. Pending queue empty, every executor finished, collected state is SUCCEEDED ---> success
if (taskQueue.isEmpty() && isAllTaskDone(runTasks) && containerCommunicator.collectState() == State.SUCCEEDED) {
// Report once more even on success; otherwise the collected statistics can be
// inaccurate when the tasks finish very quickly
lastTaskGroupContainerCommunication = reportTaskGroupCommunication(
lastTaskGroupContainerCommunication, taskCountInThisTaskGroup);
LOG.info("taskGroup[{}] completed it's tasks.", this.taskGroupId);
break;
}
// 5. If the report interval has already elapsed, report immediately
long now = System.currentTimeMillis();
if (now - lastReportTimeStamp > reportIntervalInMillSec) {
lastTaskGroupContainerCommunication = reportTaskGroupCommunication(
lastTaskGroupContainerCommunication, taskCountInThisTaskGroup);
lastReportTimeStamp = now;
//taskMonitor re-checks every running task once per reportIntervalInMillSec
for(TaskExecutor taskExecutor:runTasks){
taskMonitor.report(taskExecutor.getTaskId(),this.containerCommunicator.getCommunication(taskExecutor.getTaskId()));
}
}
Thread.sleep(sleepIntervalInMillSec);
}
任务Channel的实现
基于内存的队列 reader读取数据到队列,writer从此队列获取数据到目标地址
MemoryChannel 内存Channel的具体实现,底层其实是一个ArrayBlockingQueue
基于ReentrantLock,Condition 实现消费者和生产者队列
ReentrantLock lock;
Condition notInsufficient, notEmpty;//不充足的,不为空
具体构造传输的对象DefaultRecord 读取的数据统一转为DefaultRecord类加入到队列
通过this.channel.pushAll(this.buffer); 加入到ArrayBlockingQueue,等待消费者消费
主要关注的类:BufferedRecordExchanger,和MemoryChannel
BufferedRecordExchanger中实现了写入和读取的逻辑:
写入逻辑,会做限制行数和大小的限制:
/**
 * Buffers one record on the writer side of the exchanger.
 *
 * A record whose own size exceeds the channel byte capacity is routed to the
 * dirty-record collector instead of the buffer. When the buffer holds
 * bufferSize records (default 32) or adding this record would exceed the byte
 * capacity, the pending batch is flushed to the channel first.
 */
public void sendToWriter(Record record) {
    if (shutdown) {
        throw DataXException.asDataXException(CommonErrorCode.SHUT_DOWN_TASK, "");
    }
    Validate.notNull(record, "record不能为空.");

    // An oversized single record cannot ever fit in the channel: count it as dirty data.
    if (record.getMemorySize() > this.byteCapacity) {
        this.pluginCollector.collectDirtyRecord(record,
                new Exception(String.format("单条记录超过大小限制,当前限制为:%s", this.byteCapacity)));
        return;
    }

    // Flush the current batch when the record count or accumulated byte size would overflow.
    final boolean batchFull = this.bufferIndex >= this.bufferSize
            || this.memoryBytes.get() + record.getMemorySize() > this.byteCapacity;
    if (batchFull) {
        flush();
    }

    this.buffer.add(record);
    this.bufferIndex++;
    memoryBytes.addAndGet(record.getMemorySize());
}
MemoryChannel 中则是具体加入队列的地方,生产者和消费者的模型
总结
DataX3.0 的架构是框架+插件,根据配置文件加载到具体的插件信息,
然后执行一系列的方法。
主要关注类:LoadUtil中的几个方法loadPluginRunner,loadTaskPlugin
下次继续分享
点击查看更多内容
8人点赞
评论
共同学习,写下你的评论
评论加载中...
作者其他优质文章
正在加载中
感谢您的支持,我会继续努力的~
扫码打赏,你说多少就多少
赞赏金额会直接到老师账户
支付方式
打开微信扫一扫,即可进行扫码打赏哦