设置 Hadoop 认证

# Run an HDFS command as a specific user via the hadoop.job.ugi property.
# NOTE(review): this only works on clusters WITHOUT Kerberos; the credential is
# passed in plain text — confirm your cluster's auth mode before relying on it.
hdfs dfs -Dhadoop.job.ugi=user_name:password -ls hdfs://xxxx/dataindexing/xxx/wuid_merged

批量删除 HDFS 文件

# List files larger than ~2 GB ($5 of `-ls -R` output is the file size in bytes)
# and print their paths. `xargs echo` is a dry-run preview only — to actually
# delete, replace it with: | xargs hadoop fs -rm
hadoop fs -ls -R hdfs://qy-pcg-8-v3/stage/outface/CSIG/dddd/group/temp/11111006/20231028 | awk '$5 >= 2048000000 { print $8 }' | xargs echo

# Count how many files exceed ~5 GB.
hadoop fs -ls -R hdfs://qy-pcg-8-v3/stage/outface/CSIG/dddd/group/temp/11111006/20231028 | awk '$5 >= 5048000000 { print $8 }' | wc -l

删除文件夹

# Recursively delete a directory (add -skipTrash to bypass the trash and free space immediately).
hdfs dfs -rm -r /path/to/folder

查询 HDFS 文件大小并排序

 1hadoop fs -du -h hdfs://qy-pcg-8-v3/stage/outface/CSIG/dddd/group/temp/11111006/20231028 | sort -h 
 2# 查询某目录下的文件总大小
 3hadoop fs -du -s -h hdfs://ss-cdg-13-v2/data/SPARK/CDG/dddd/track_log/non_major_user/detail
 4
 5# 查询实际存储占用大小,不包括副本
 6hadoop fs -ls -R hdfs://xxxx/data/SPARK/CDG/dddd/track_log/non_major_user/detail/20241020 | awk '{sum += $5} END {print sum}'
 7
 8#查询文件的副本数量
 9hadoop fs -ls -stat hdfs://ss-cdg-13-v2/data/SPARK/CDG/dddd/track_log/non_major_user/detail/20241020/2024102008/compacted-part-bf2d3680-7cbf-4497-9808-6924865a8ffc-0.recordio
10
11#查询某目录下的文件数量大小
12hadoop fs -ls -R hdfs://ss-cdg-13-v2/data/SPARK/CDG/dddd/track_log/non_major_user/detail | grep ^- | wc -l

自定义HDFS文件名称

 1class CustomOutputFormat extends MultipleTextOutputFormat[Any, Any] {
 2
 3  override def generateFileNameForKeyValue(key: Any, value: Any, name: String): String = {
 4    //这里的key和value指的就是要写入文件的rdd对,再此,我定义文件名以key.txt来命名,当然也可以根据其他的需求来进行生成文件名
 5    val fileName = key.asInstanceOf[String] + ".check"
 6    fileName
 7  }
 8
 9  override def generateActualKey(key: Any, value: Any): String = {
10    null
11  }
12
13  override def checkOutputSpecs(ignored: FileSystem, job: JobConf): Unit = {
14    val outDir: Path = FileOutputFormat.getOutputPath(job)
15    if (outDir != null)
16      FileOutputFormat.setOutputPath(job, outDir)
17  }
18}
19
20val checkData = spark.sparkContext.parallelize(Seq(partitionTime))
21
22checkData.map(x=>(x,""))    .saveAsHadoopFile(checkFilePath,classOf[String],classOf[String],classOf[CustomOutputFormat])

追加文件内容

# Append a local file's contents to an existing HDFS file (requires dfs.support.append enabled).
hadoop fs -appendToFile /root/total_gray_user_id.txt hdfs://xxxxx/data/data/user_id.txt

覆盖文件内容

 1# 上传单个文件:
 2hadoop fs -put /root/total_gray_user_id.txt hdfs://xxxxx/data/data/user_id.txt
 3
 4#上传多个文件到目录
 5hadoop fs -put /home/user/file1.txt /home/user/file2.txt /user/hadoop/mydir/
 6
 7#强制覆盖已有文件
 8hadoop fs -put -f /root/total_gray_user_id.txt hdfs://xxxxx/data/data/user_id.txt
 9
10#上传目录
11hadoop fs -put /root/mylocaldir hdfs://xxxxx/data/data/