I. SparkContext API
1. Read HDFS data and convert it to a NumPy array
#!/usr/bin/python
# -*- coding: UTF-8 -*-
from pyspark import SparkContext,SparkConf
import numpy as np
import pickle
dirPath = 'hdfs://xxx/user/root/data_16/11/labels/part-00199'  # note: this data is stored in pickle format
sc = SparkContext(conf=SparkConf().setAppName("The first example"))
# textFiles=sc.textFile(dirPath)
textFiles=sc.pickleFile(dirPath)
data=textFiles.collect()
# print(data[:5])
print(type(data)) # <type 'list'>
print(data[0].dtype) # float16
data = np.array(data, np.float32)  # convert to a NumPy array
np.save('123.npy', data)  # save the data locally
np.load('123.npy')  # load the data back
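pickleFile also accepts a directory (or a glob over many part-* files), so the whole label set can be read in one call; a minimal sketch, assuming the same pickle-format layout as above:
rdd = sc.pickleFile('hdfs://xxx/user/root/data_16/11/labels/')  # read every part-* file under the directory
print(rdd.take(5))  # inspect a few records instead of collecting everything
data = np.array(rdd.collect(), np.float32)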
2. wholeTextFiles: read all files in a directory (local or HDFS)
- wholeTextFiles(path, minPartitions=None, use_unicode=True)
For example, if you have the following files:
hdfs://a-hdfs-path/part-00000
hdfs://a-hdfs-path/part-00001
...
hdfs://a-hdfs-path/part-nnnnn
Do rdd = sparkContext.wholeTextFiles("hdfs://a-hdfs-path"), then rdd contains:
(a-hdfs-path/part-00000, its content)
(a-hdfs-path/part-00001, its content)
...
(a-hdfs-path/part-nnnnn, its content)
#!/usr/bin/python
# -*- coding: UTF-8 -*-
from pyspark import SparkContext,SparkConf
import os
# from pyspark.context import SparkContext
# from pyspark.conf import SparkConf
#from pyspark.sql import DataFrame,SQLContext
sc = SparkContext(conf=SparkConf().setAppName("The first example"))
dirPath = os.path.join('./', "files")  # dirPath can also be a directory on HDFS
os.mkdir(dirPath)
with open(os.path.join(dirPath, "1.txt"), "w") as file1:
    file1.write("10")
with open(os.path.join(dirPath, "2.txt"), "w") as file2:
    file2.write("20")
textFiles = sc.wholeTextFiles(dirPath)
# sorted(textFiles.collect())
print(type(textFiles)) # <class 'pyspark.rdd.RDD'>
print(textFiles.collect())
print(type(textFiles.collect())) # list
# [(u'.../1.txt', u'10'), (u'.../2.txt', u'20')]
print(len(textFiles.collect())) # 2
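Because each record is a (path, content) pair, the RDD can also be transformed directly instead of collecting it; a minimal sketch using the two small files created above:
pairs = sc.wholeTextFiles(dirPath)
print(pairs.keys().collect())  # just the file paths
print(pairs.map(lambda kv: int(kv[1])).sum())  # parse each file body as an int: 10 + 20 = 30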
3. addFile(path)
Add a file to be downloaded with this Spark job on every node. The path passed can be either a local file, a file in HDFS (or other Hadoop-supported filesystems), or an HTTP, HTTPS or FTP URI.
To access the file in Spark jobs, use SparkFiles.get(fileName) with the filename to find its download location.
#!/usr/bin/python
# -*- coding: UTF-8 -*-
from pyspark import SparkFiles
from pyspark import SparkContext, SparkConf
import os
sc = SparkContext(conf=SparkConf().setAppName("The first example"))
path = os.path.join('./', "test.txt")  # could also be an HDFS path
print(path)
with open(path, "w") as testFile:
    testFile.write("100")
sc.addFile(path)  # add a file to be downloaded with this Spark job on every node
def func(iterator):
    with open(SparkFiles.get("test.txt")) as testFile:
        fileval = int(testFile.readline())
    return [float(x) * fileval for x in iterator]
result = sc.parallelize([1, 2, 3, 4]).mapPartitions(func)
print(result.count())    # 4
print(result.collect())  # [100.0, 200.0, 300.0, 400.0]
Run spark-submit test2.py; the output is:
./test.txt
4
[100.0, 200.0, 300.0, 400.0]
4. addPyFile(path)
Add a .py or .zip dependency for all tasks to be executed on this SparkContext in the future. The path passed can be either a local file, a file in HDFS (or other Hadoop-supported filesystems), or an HTTP, HTTPS or FTP URI.
import pyspark_csv as pycsv
sc.addPyFile('pyspark_csv.py')
#!/usr/bin/python
# -*- coding: UTF-8 -*-
from pyspark import SparkContext,SparkConf
from pyspark import SparkFiles
import pyspark_csv as pycsv
import os
sc = SparkContext(conf=SparkConf().setAppName("The first example"))
sc.addPyFile('pyspark_csv.py')
# print(SparkFiles.get("pyspark_csv.py"))  # returns the absolute path of the file
os.popen("python " + SparkFiles.get("pyspark_csv.py"))  # run the script
5. binaryFiles(path, minPartitions=None)
Experimental. Read a directory of binary files from HDFS, a local file system (available on all nodes), or any Hadoop-supported file system URI as a byte array. Each file is read as a single record and returned in a key-value pair, where the key is the path of each file and the value is the content of each file.
Note: small files are preferred; large files are also allowed, but may cause bad performance.
#!/usr/bin/python
# -*- coding: UTF-8 -*-
from pyspark import SparkFiles
from pyspark import SparkContext,SparkConf
import os
sc = SparkContext(conf=SparkConf().setAppName("The first example"))
dirPath='hdfs://xxx/user/root/data_16/11/labels/part-00199'
data=sc.binaryFiles(dirPath) # Read a directory of binary files from HDFS
print(data) # org.apache.spark.api.java.JavaPairRDD@27a22ddc
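Each record of the returned pair RDD is (path, bytes), so the raw content can be inspected or deserialized on the driver; a minimal sketch, assuming the files are small enough to pull back:
first_path, first_bytes = data.first()
print(first_path, len(first_bytes))  # file path and its size in bytes
# obj = pickle.loads(first_bytes)  # only if the bytes really are a pickle payload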
6. clearFiles()
Clear the job's list of files added by addFile or addPyFile so that they do not get downloaded to any new nodes.
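A minimal sketch, assuming a Spark release that still ships clearFiles (newer releases have removed it):
sc.addFile('test.txt')
sc.clearFiles()  # files already shipped stay in place, but new nodes will no longer download them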
II. RDD API
1. Saving files
saveAsPickleFile
saveAsPickleFile(path, batchSize=10)
Save this RDD as a SequenceFile of serialized objects. The serializer used is pyspark.serializers.PickleSerializer; the default batch size is 10.
>>> from tempfile import NamedTemporaryFile
>>> tmpFile = NamedTemporaryFile(delete=True)
>>> tmpFile.close()
>>> sc.parallelize([1, 2, 'spark', 'rdd']).saveAsPickleFile(tmpFile.name, 3)
>>> sorted(sc.pickleFile(tmpFile.name, 5).collect())
[1, 2, 'rdd', 'spark']
saveAsTextFile
saveAsTextFile(path)
Save this RDD as a text file, using string representations of elements.
>>> tempFile = NamedTemporaryFile(delete=True)
>>> tempFile.close()
>>> sc.parallelize(range(10)).saveAsTextFile(tempFile.name)
>>> from fileinput import input
>>> from glob import glob
>>> ''.join(sorted(input(glob(tempFile.name + "/part-0000*"))))
'0\n1\n2\n3\n4\n5\n6\n7\n8\n9\n'
Empty lines are tolerated when saving to text files.
>>> tempFile2 = NamedTemporaryFile(delete=True)
>>> tempFile2.close()
>>> sc.parallelize(['', 'foo', '', 'bar', '']).saveAsTextFile(tempFile2.name)
>>> ''.join(sorted(input(glob(tempFile2.name + "/part-0000*"))))
'\n\n\nbar\nfoo\n'
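The saved parts can be read back as an RDD of strings with textFile; a minimal sketch reusing tempFile from above:
lines = sc.textFile(tempFile.name)  # reads every part-* file under the directory
print(sorted(lines.collect()))  # ['0', '1', ..., '9'] (unicode strings on Python 2)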
III. SparkFiles
Resolves paths to files added through SparkContext.addFile().
classmethod get(filename)  # get the absolute path of a file
Get the absolute path of a file added through SparkContext.addFile().
classmethod getRootDirectory()
Get the root directory that contains files added through SparkContext.addFile().
#!/usr/bin/python
# -*- coding: UTF-8 -*-
from pyspark import SparkFiles
from pyspark import SparkContext,SparkConf
import os
sc = SparkContext(conf=SparkConf().setAppName("The first example"))
path = os.path.join('./', "test.txt")  # could also be an HDFS path
with open(path, "w") as testFile:
    testFile.write("100")
sc.addFile(path)  # add a file to be downloaded with this Spark job on every node
def func(iterator):
    with open(SparkFiles.get("test.txt")) as testFile:  # SparkFiles.get(filename)
        fileval = int(testFile.readline())
    return [x * fileval for x in iterator]
print(sc.parallelize([1, 2, 3, 4]).mapPartitions(func).collect())
# [100, 200, 300, 400]
IV. DataFrameReader
csv
>>> df = spark.read.csv('python/test_support/sql/ages.csv')
>>> df.dtypes
[('_c0', 'string'), ('_c1', 'string')]
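csv also accepts the usual reader options such as header and inferSchema; a minimal sketch with a hypothetical file path:
>>> df = spark.read.csv('hdfs://xxx/data/people.csv', header=True, inferSchema=True)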
format(source)
Specifies the input data source format.
Parameters: source – string, name of the data source, e.g. 'json', 'parquet'.
>>> df = spark.read.format('json').load('python/test_support/sql/people.json')
>>> df.dtypes
[('age', 'bigint'), ('name', 'string')]
json
Loads JSON files and returns the results as a DataFrame.
>>> df1 = spark.read.json('python/test_support/sql/people.json')
>>> df1.dtypes
[('age', 'bigint'), ('name', 'string')]
>>> rdd = sc.textFile('python/test_support/sql/people.json')
>>> df2 = spark.read.json(rdd)
>>> df2.dtypes
[('age', 'bigint'), ('name', 'string')]
load(path=None,format=None,schema=None,**options)
>>> df = spark.read.load('python/test_support/sql/parquet_partitioned', opt1=True,
... opt2=1, opt3='str')
>>> df.dtypes
[('name', 'string'), ('year', 'int'), ('month', 'int'), ('day', 'int')]
>>> df = spark.read.format('json').load(['python/test_support/sql/people.json',
... 'python/test_support/sql/people1.json'])
>>> df.dtypes
[('age', 'bigint'), ('aka', 'string'), ('name', 'string')]
orc(path)
Loads ORC files, returning the result as a DataFrame.
Note: currently ORC support is only available together with Hive support.
>>> df = spark.read.orc('python/test_support/sql/orc_partitioned')
>>> df.dtypes
[('a', 'bigint'), ('b', 'int'), ('c', 'int')]
parquet(*paths)
Loads Parquet files, returning the result as a DataFrame.
>>> df = spark.read.parquet('python/test_support/sql/parquet_partitioned')
>>> df.dtypes
[('name', 'string'), ('year', 'int'), ('month', 'int'), ('day', 'int')]
text(paths)
>>> df = spark.read.text('python/test_support/sql/text-test.txt')
>>> df.collect()
[Row(value=u'hello'), Row(value=u'this')]
#!/usr/bin/python
# -*- coding: UTF-8 -*-
from pyspark import SparkFiles
from pyspark import SparkContext,SparkConf
from pyspark.sql import DataFrame,SQLContext,DataFrameReader
import os
from pyspark.sql import SparkSession
# sc = SparkContext(conf=SparkConf().setAppName("The first example"))
path = os.path.join('./', "dna_seq.txt")  # could also be an HDFS path
spark = SparkSession.builder \
.master("local") \
.appName("Word Count") \
.config("spark.some.config.option", "some-value") \
.getOrCreate()
df=spark.read.text(path)
# spark.read.json("hdfs://localhost:9000/testdata/person.json")
# spark.read.csv()
print(type(df)) # <class 'pyspark.sql.dataframe.DataFrame'>
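A few follow-up calls for inspecting the loaded DataFrame (text gives a single string column named value):
df.printSchema()  # a single string column named value
print(df.count())  # number of lines in the file
df.show(5, truncate=False)  # first five lines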
V. DataFrameWriter
Use DataFrame.write to access this.
csv
>>> df.write.csv(os.path.join(tempfile.mkdtemp(), 'data'))
format(source)
Specifies the underlying output data source.
Parameters: source – string, name of the data source, e.g. 'json', 'parquet'.
>>> df.write.format('json').save(os.path.join(tempfile.mkdtemp(), 'data'))
json
>>> df.write.json(os.path.join(tempfile.mkdtemp(), 'data'))
mode(saveMode)
Specifies the behavior when data or table already exists. Options include:
- append: Append contents of this DataFrame to existing data.
- overwrite: Overwrite existing data.
- error: Throw an exception if data already exists.
- ignore: Silently ignore this operation if data already exists.
>>> df.write.mode('append').parquet(os.path.join(tempfile.mkdtemp(), 'data'))
save(path=None, format=None, mode=None, partitionBy=None, **options)
>>> df.write.mode('append').parquet(os.path.join(tempfile.mkdtemp(), 'data'))
text(path, compression=None)
Saves the content of the DataFrame in a text file at the specified path.
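A minimal sketch of writing text output, assuming df from the reader examples above (write.text requires exactly one string column):
>>> df.select('name').write.text(os.path.join(tempfile.mkdtemp(), 'data'))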
VI. DataStreamReader
Use spark.readStream to access this.
csv
>>> csv_sdf = spark.readStream.csv(tempfile.mkdtemp(), schema = sdf_schema)
>>> csv_sdf.isStreaming
True
>>> csv_sdf.schema == sdf_schema
True
format(source)
Specifies the input data source format.
Parameters: source – string, name of the data source, e.g. 'json', 'parquet'.
>>> s = spark.readStream.format("text")
json
>>> json_sdf = spark.readStream.json(tempfile.mkdtemp(), schema = sdf_schema)
>>> json_sdf.isStreaming
True
>>> json_sdf.schema == sdf_schema
True
load(path=None, format=None, schema=None, **options)
>>> json_sdf = spark.readStream.format("json") \
... .schema(sdf_schema) \
... .load(tempfile.mkdtemp())
>>> json_sdf.isStreaming
True
>>> json_sdf.schema == sdf_schema
True
text(path)
>>> text_sdf = spark.readStream.text(tempfile.mkdtemp())
>>> text_sdf.isStreaming
True
>>> "value" in str(text_sdf.schema)
True
VII. Appendix: Hadoop file system commands
hdfs dfs -ls  # list a directory
hdfs dfs -ls xxx/ | wc -l  # count the files and directories under xxx
hdfs dfs -mkdir xxx  # create a directory
hdfs dfs -rm -r xxx  # delete a file or directory
hdfs dfs -put xxx data  # upload xxx to the data directory on HDFS
hdfs dfs -get xxx ./  # copy xxx (a file or directory) from HDFS to the local machine
yarn application -kill application_1502181070712_0574  # kill a YARN application
spark-submit test.py  # run the script test.py
Reference: https://blog.csdn.net/wc781708249/article/details/78251701