设置 Hadoop 认证
# Authenticate as user_name:password via the hadoop.job.ugi property, then list the path.
hdfs dfs -Dhadoop.job.ugi=user_name:password -ls hdfs://xxxx/dataindexing/xxx/wuid_merged
批量删除 HDFS 文件
# Recursively list files and print the paths of those whose size ($5) is >= ~2 GB.
# `xargs echo` is a dry-run preview; replace `echo` with `hadoop fs -rm` to actually delete.
hadoop fs -ls -R hdfs://qy-pcg-8-v3/stage/outface/CSIG/dddd/group/temp/11111006/20231028 | awk -v min=2048000000 '$5 >= min { print $8 }' | xargs echo

# Count how many files are >= ~5 GB.
hadoop fs -ls -R hdfs://qy-pcg-8-v3/stage/outface/CSIG/dddd/group/temp/11111006/20231028 | awk -v min=5048000000 '$5 >= min { print $8 }' | wc -l
删除文件夹
# Recursively delete a directory (add -skipTrash to bypass the HDFS trash).
hdfs dfs -rm -r /path/to/folder
查询 HDFS 文件大小并排序
# Per-entry sizes under a directory, human-readable, sorted ascending by size.
hadoop fs -du -h hdfs://qy-pcg-8-v3/stage/outface/CSIG/dddd/group/temp/11111006/20231028 | sort -h

# Total size of everything under a directory (summary only).
hadoop fs -du -s -h hdfs://ss-cdg-13-v2/data/SPARK/CDG/dddd/track_log/non_major_user/detail

# Sum of logical file sizes (single copy; replicas are NOT counted).
hadoop fs -ls -R hdfs://xxxx/data/SPARK/CDG/dddd/track_log/non_major_user/detail/20241020 | awk '{sum += $5} END {print sum}'

# Replication factor and name of a file.
# NOTE: `-ls -stat` is not a valid combination; `-stat` takes a format string
# where %r = replication factor and %n = file name.
hadoop fs -stat "%r %n" hdfs://ss-cdg-13-v2/data/SPARK/CDG/dddd/track_log/non_major_user/detail/20241020/2024102008/compacted-part-bf2d3680-7cbf-4497-9808-6924865a8ffc-0.recordio

# Number of regular files (lines starting with '-', i.e. not directories) under a directory.
hadoop fs -ls -R hdfs://ss-cdg-13-v2/data/SPARK/CDG/dddd/track_log/non_major_user/detail | grep ^- | wc -l
自定义HDFS文件名称
// Custom output format that names each output file after the record's key
// (with a ".check" suffix) instead of the default "part-NNNNN" naming.
class CustomOutputFormat extends MultipleTextOutputFormat[Any, Any] {

  // key/value are the pairs of the RDD being written. The output file for a
  // given pair is "<key>.check"; adapt the suffix/scheme to other needs.
  override def generateFileNameForKeyValue(key: Any, value: Any, name: String): String = {
    val fileName = key.asInstanceOf[String] + ".check"
    fileName
  }

  // Return null so the key itself is not written into the file content —
  // only the value ends up in the output file.
  override def generateActualKey(key: Any, value: Any): String = {
    null
  }

  // Skip the default "output directory already exists" check so the job can
  // write into a pre-existing directory.
  override def checkOutputSpecs(ignored: FileSystem, job: JobConf): Unit = {
    val outDir: Path = FileOutputFormat.getOutputPath(job)
    if (outDir != null)
      FileOutputFormat.setOutputPath(job, outDir)
  }
}
// Write a single marker record (the partition time) to checkFilePath.
// With CustomOutputFormat above, this produces a file named "<partitionTime>.check".
val checkData = spark.sparkContext.parallelize(Seq(partitionTime))

checkData
  .map(x => (x, ""))
  .saveAsHadoopFile(checkFilePath, classOf[String], classOf[String], classOf[CustomOutputFormat])
追加文件内容
# Append the local file's content to the end of an existing HDFS file.
hadoop fs -appendToFile /root/total_gray_user_id.txt hdfs://xxxxx/data/data/user_id.txt
覆盖文件内容
# Upload a single file (plain -put fails if the destination already exists):
hadoop fs -put /root/total_gray_user_id.txt hdfs://xxxxx/data/data/user_id.txt

# Upload multiple files into a directory:
hadoop fs -put /home/user/file1.txt /home/user/file2.txt /user/hadoop/mydir/

# Force-overwrite an existing file:
hadoop fs -put -f /root/total_gray_user_id.txt hdfs://xxxxx/data/data/user_id.txt

# Upload a whole local directory:
hadoop fs -put /root/mylocaldir hdfs://xxxxx/data/data/