I’m trying to use Docker Compose to build a Hadoop cluster on my MacBook (M1, Apple Silicon), and I have exposed port 9000 of the master container to the host.
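In docker-compose.yml the mapping looks roughly like this (a simplified excerpt, not the full file):

services:
  hadoop-master:
    hostname: hadoop-master
    ports:
      - "9000:9000"   # NameNode RPC port published to the host, so the client can use hdfs://localhost:9000

Here is my Java code: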
package cn.dioxide;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import java.io.BufferedReader;
import java.io.BufferedWriter;
import java.io.InputStreamReader;
import java.io.OutputStreamWriter;
public class HadoopTest {
    public static void main(String[] args) {
        try {
            Configuration conf = new Configuration();
            conf.set("fs.defaultFS", "hdfs://localhost:9000");
            conf.set("dfs.client.use.datanode.hostname", "true");
            FileSystem fs = FileSystem.get(conf);
            Path dirPath = new Path("/user/test");
            if (fs.exists(dirPath)) {
                fs.delete(dirPath, true);
            }
            fs.mkdirs(dirPath);
            Path filePath = new Path("/user/test/test.txt");
            BufferedWriter writer = new BufferedWriter(new OutputStreamWriter(fs.create(filePath, true)));
            writer.write("Hello Hadoop!");
            writer.close();
            System.out.println("content: ");
            BufferedReader reader = new BufferedReader(new InputStreamReader(fs.open(filePath)));
            String line;
            while ((line = reader.readLine()) != null) {
                System.out.println(line);
            }
            reader.close();
            fs.delete(filePath, true);
            System.out.println("file has been deleted");
            fs.close();
        } catch (Exception e) {
            e.printStackTrace();
        }
    }
}
Then the error below occurred:
org.apache.hadoop.ipc.RemoteException(java.io.IOException): File /user/test/test.txt could only be written to 0 of the 1 minReplication nodes. There are 3 datanode(s) running and 3 node(s) are excluded in this operation.
at org.apache.hadoop.hdfs.server.blockmanagement.BlockManager.chooseTarget4NewBlock(BlockManager.java:2350)
at org.apache.hadoop.hdfs.server.namenode.FSDirWriteFileOp.chooseTargetForNewBlock(FSDirWriteFileOp.java:294)
at org.apache.hadoop.hdfs.server.namenode.FSNamesystem.getAdditionalBlock(FSNamesystem.java:2989)
at org.apache.hadoop.hdfs.server.namenode.NameNodeRpcServer.addBlock(NameNodeRpcServer.java:912)
at org.apache.hadoop.hdfs.protocolPB.ClientNamenodeProtocolServerSideTranslatorPB.addBlock(ClientNamenodeProtocolServerSideTranslatorPB.java:595)
at org.apache.hadoop.hdfs.protocol.proto.ClientNamenodeProtocolProtos$ClientNamenodeProtocol$2.callBlockingMethod(ClientNamenodeProtocolProtos.java)
at org.apache.hadoop.ipc.ProtobufRpcEngine2$Server$ProtoBufRpcInvoker.call(ProtobufRpcEngine2.java:621)
at org.apache.hadoop.ipc.ProtobufRpcEngine2$Server$ProtoBufRpcInvoker.call(ProtobufRpcEngine2.java:589)
at org.apache.hadoop.ipc.ProtobufRpcEngine2$Server$ProtoBufRpcInvoker.call(ProtobufRpcEngine2.java:573)
at org.apache.hadoop.ipc.RPC$Server.call(RPC.java:1227)
at org.apache.hadoop.ipc.Server$RpcCall.run(Server.java:1094)
at org.apache.hadoop.ipc.Server$RpcCall.run(Server.java:1017)
at java.base/java.security.AccessController.doPrivileged(Native Method)
at java.base/javax.security.auth.Subject.doAs(Subject.java:423)
at org.apache.hadoop.security.UserGroupInformation.doAs(UserGroupInformation.java:1899)
at org.apache.hadoop.ipc.Server$Handler.run(Server.java:3048)
at org.apache.hadoop.ipc.Client.getRpcResponse(Client.java:1573)
at org.apache.hadoop.ipc.Client.call(Client.java:1519)
at org.apache.hadoop.ipc.Client.call(Client.java:1416)
at org.apache.hadoop.ipc.ProtobufRpcEngine2$Invoker.invoke(ProtobufRpcEngine2.java:242)
at org.apache.hadoop.ipc.ProtobufRpcEngine2$Invoker.invoke(ProtobufRpcEngine2.java:129)
at jdk.proxy2/jdk.proxy2.$Proxy15.addBlock(Unknown Source)
at org.apache.hadoop.hdfs.protocolPB.ClientNamenodeProtocolTranslatorPB.addBlock(ClientNamenodeProtocolTranslatorPB.java:530)
at java.base/jdk.internal.reflect.DirectMethodHandleAccessor.invoke(DirectMethodHandleAccessor.java:103)
at java.base/java.lang.reflect.Method.invoke(Method.java:580)
at org.apache.hadoop.io.retry.RetryInvocationHandler.invokeMethod(RetryInvocationHandler.java:422)
at org.apache.hadoop.io.retry.RetryInvocationHandler$Call.invokeMethod(RetryInvocationHandler.java:165)
at org.apache.hadoop.io.retry.RetryInvocationHandler$Call.invoke(RetryInvocationHandler.java:157)
at org.apache.hadoop.io.retry.RetryInvocationHandler$Call.invokeOnce(RetryInvocationHandler.java:95)
at org.apache.hadoop.io.retry.RetryInvocationHandler.invoke(RetryInvocationHandler.java:359)
at jdk.proxy2/jdk.proxy2.$Proxy16.addBlock(Unknown Source)
at org.apache.hadoop.hdfs.DFSOutputStream.addBlock(DFSOutputStream.java:1084)
at org.apache.hadoop.hdfs.DataStreamer.locateFollowingBlock(DataStreamer.java:1898)
at org.apache.hadoop.hdfs.DataStreamer.nextBlockOutputStream(DataStreamer.java:1700)
at org.apache.hadoop.hdfs.DataStreamer.run(DataStreamer.java:707)
I build a master image and a slave image from two Dockerfiles (Dockerfile-master and Dockerfile-slave), which are identical except for the entrypoint script they copy:
# Dockerfile-hadoop
FROM ubuntu:14.04
RUN apt-get update && apt-get install -y openssh-server wget iptables
COPY resources/jdk-11.0.23 /opt/jdk-11.0.23
ENV JAVA_HOME=/opt/jdk-11.0.23
ENV PATH=$JAVA_HOME/bin:$PATH
COPY resources/hadoop-3.3.6 /opt/hadoop-3.3.6
RUN mv /opt/hadoop-3.3.6 /opt/hadoop
RUN echo "export JAVA_HOME=$JAVA_HOME" >> /opt/hadoop/etc/hadoop/hadoop-env.sh
ENV HADOOP_HOME=/opt/hadoop
ENV PATH=$HADOOP_HOME/bin:$HADOOP_HOME/sbin:$PATH
RUN ssh-keygen -t rsa -P '' -f /root/.ssh/id_rsa \
    && cat /root/.ssh/id_rsa.pub >> /root/.ssh/authorized_keys \
    && chmod 0600 /root/.ssh/authorized_keys \
    && echo "Host *\n    StrictHostKeyChecking no" >> /etc/ssh/ssh_config
COPY config/core-site.xml $HADOOP_HOME/etc/hadoop/core-site.xml
COPY config/hdfs-site.xml $HADOOP_HOME/etc/hadoop/hdfs-site.xml
COPY config/yarn-site.xml $HADOOP_HOME/etc/hadoop/yarn-site.xml
COPY config/mapred-site.xml $HADOOP_HOME/etc/hadoop/mapred-site.xml
# Dockerfile-master copies the master entrypoint:
COPY entrypoint/entrypoint-master.sh /entrypoint.sh
# Dockerfile-slave copies the slave entrypoint instead:
COPY entrypoint/entrypoint-slave.sh /entrypoint.sh
RUN chmod +x /entrypoint.sh
ENTRYPOINT ["/entrypoint.sh"]
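Both images are then used from docker-compose.yml, roughly like this (again simplified; volumes, networks and the port mapping shown above are omitted):

services:
  hadoop-master:
    build:
      context: .
      dockerfile: Dockerfile-master
    hostname: hadoop-master
  hadoop-slave2:
    build:
      context: .
      dockerfile: Dockerfile-slave
    hostname: hadoop-slave2
  # ...the other worker is defined the same way as hadoop-slave2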
And my entrypoint-master.sh shell script is below:
#!/bin/bash
iptables -F
service ssh start
echo "export PDSH_RCMD_TYPE=ssh" >> /etc/profile
source /etc/profile
export HADOOP_HOME=/opt/hadoop
export PATH=$HADOOP_HOME/bin:$PATH
if [ ! -d /hadoop/dfs/name ]; then
    hdfs namenode -format -force
fi
$HADOOP_HOME/sbin/start-dfs.sh
$HADOOP_HOME/sbin/start-yarn.sh
$HADOOP_HOME/sbin/hadoop-daemon.sh start namenode
$HADOOP_HOME/sbin/hadoop-daemon.sh start secondarynamenode
$HADOOP_HOME/sbin/yarn-daemon.sh start resourcemanager
$HADOOP_HOME/sbin/yarn-daemon.sh start nodemanager
$HADOOP_HOME/sbin/hadoop-daemon.sh start datanode
hdfs dfs -chmod 777 /
tail -f /dev/null
entrypoint-slave.sh:
#!/bin/bash
iptables -F
service ssh start
echo "export PDSH_RCMD_TYPE=ssh" >> /etc/profile
source /etc/profile
export HADOOP_HOME=/opt/hadoop
export PATH=$HADOOP_HOME/bin:$PATH
if [ ! -d /hadoop/dfs/name ]; then
    hdfs namenode -format -force
fi
$HADOOP_HOME/sbin/hadoop-daemon.sh start datanode
$HADOOP_HOME/sbin/yarn-daemon.sh start nodemanager
tail -f /dev/null
And my Hadoop XML configuration files are below.
core-site.xml
<?xml version="1.0"?>
<configuration>
    <property>
        <name>fs.defaultFS</name>
        <value>hdfs://hadoop-master:9000/</value>
    </property>
    <property>
        <name>io.file.buffer.size</name>
        <value>131072</value>
    </property>
    <property>
        <name>hadoop.tmp.dir</name>
        <value>/usr/local/hadoop/hadoop/tmp</value>
    </property>
    <property>
        <name>hadoop.proxyuser.root.groups</name>
        <value>*</value>
    </property>
    <property>
        <name>hadoop.proxyuser.root.hosts</name>
        <value>*</value>
    </property>
</configuration>
hdfs-site.xml
<?xml version="1.0"?>
<configuration>
    <property>
        <name>dfs.namenode.name.dir</name>
        <value>file:/root/hdfs/namenode</value>
        <description>NameNode directory for namespace and transaction logs storage.</description>
    </property>
    <property>
        <name>dfs.datanode.data.dir</name>
        <value>file:/root/hdfs/datanode</value>
        <description>DataNode directory</description>
    </property>
    <property>
        <name>dfs.replication</name>
        <value>2</value>
    </property>
    <property>
        <name>dfs.permissions.superusergroup</name>
        <value>supergroup</value>
    </property>
    <property>
        <name>dfs.permissions</name>
        <value>false</value>
    </property>
</configuration>
mapred-site.xml
<?xml version="1.0"?>
<configuration>
    <property>
        <name>mapreduce.framework.name</name>
        <value>yarn</value>
    </property>
    <property>
        <name>mapreduce.jobhistory.address</name>
        <value>hadoop-master:10020</value>
    </property>
    <property>
        <name>mapreduce.jobhistory.web.address</name>
        <value>hadoop-master:19888</value>
    </property>
    <property>
        <name>mapreduce.application.classpath</name>
        <value>$HADOOP_HOME/share/hadoop/mapreduce/*:$HADOOP_HOME/share/hadoop/mapreduce/lib/*</value>
    </property>
</configuration>
yarn-site.xml
<?xml version="1.0"?>
<configuration>
    <!-- Site specific YARN configuration properties -->
    <property>
        <name>yarn.resourcemanager.hostname</name>
        <value>hadoop-slave2</value>
    </property>
    <property>
        <name>yarn.nodemanager.aux-services</name>
        <value>mapreduce_shuffle</value>
    </property>
    <property>
        <name>yarn.resourcemanager.address</name>
        <value>hadoop-slave2:8032</value>
    </property>
    <property>
        <name>yarn.resourcemanager.scheduler.address</name>
        <value>hadoop-slave2:8030</value>
    </property>
    <property>
        <name>yarn.resourcemanager.resource-tracker.address</name>
        <value>hadoop-slave2:8031</value>
    </property>
    <property>
        <name>yarn.resourcemanager.admin.address</name>
        <value>hadoop-slave2:8033</value>
    </property>
    <property>
        <name>yarn.resourcemanager.webapp.address</name>
        <value>hadoop-slave2:8088</value>
    </property>
    <property>
        <name>yarn.nodemanager.resource.memory-mb</name>
        <value>2048</value>
    </property>
    <property>
        <name>yarn.nodemanager.env-whitelist</name>
        <value>JAVA_HOME,HADOOP_COMMON_HOME,HADOOP_HDFS_HOME,HADOOP_CONF_DIR,CLASSPATH_PREPEND_DISTCACHE,HADOOP_YARN_HOME,HADOOP_HOME</value>
    </property>
</configuration>