Connect & read Datalake from Python
A couple Azure libraries need to be installed first.
#pip install azure-storage-file-datalake
#pip install azure-identity
If using service principal:
# List every path in a Data Lake container, authenticating with a
# service principal (Azure AD app registration).
from azure.storage.filedatalake import DataLakeServiceClient
from azure.identity import ClientSecretCredential

tenant_id = ""
app_id = ""
secret = ""
storage_account_name = ""
container_name = ''

# ADLS Gen2 endpoints use the .dfs. (not .blob.) host name.
account_url = f'https://{storage_account_name}.dfs.core.windows.net/'
service = DataLakeServiceClient(
    account_url=account_url,
    credential=ClientSecretCredential(tenant_id, app_id, secret),
)

# Walk the container from its root and print each path name.
fs_client = service.get_file_system_client(container_name)
for entry in fs_client.get_paths('/'):
    print(entry.name)
If using access key:
# List all file systems (i.e. containers) in the storage account,
# authenticating with an account access key.
from azure.storage.filedatalake import DataLakeServiceClient

storage_account_name = ''  # fix: was referenced but never defined in this snippet
account_url = 'https://{}.dfs.core.windows.net/'.format(storage_account_name)
access_key = ''  # account access key from the Azure portal

# Passing the raw access key as `credential` authenticates with Shared Key.
datalake_service_client = DataLakeServiceClient(account_url=account_url, credential=access_key)

# Enumerate every container this key can see (requires account-level access).
file_systems = datalake_service_client.list_file_systems()
for file_system in file_systems:
    print(file_system.name)
Note: different service principals / access keys may have different permissions on the data lake, so some operations may fail — e.g. listing file systems (i.e. containers) — while others, such as getting the paths of a specific container, may still work.
To download files from a subdirectory using access key
Connect to the file system and list all files under a directory, then download each file, appending them into a single local file. From there you can continue as needed, e.g. load the file into pandas.
#https://docs.microsoft.com/en-us/azure/architecture/data-science-process/explore-data-blob
# Download every file under a Data Lake subdirectory, appending them all
# into one local file, authenticating with an account access key.
from azure.storage.filedatalake import DataLakeServiceClient

endpoint = 'https://mydatalake001.dfs.core.windows.net/'
access_key = 'xxxxxxx'
service_client = DataLakeServiceClient(account_url=endpoint, credential=access_key)

container = 'mycontainer'  # fix: typo 'mycontianer'
site = 'xxx'
partition = '2022-07-26'
dir_path = f"osipi/archive/{site}/csv/{partition}"
local_temp_file = 'c:\\temp\\download.csv'

try:
    file_system_client = service_client.get_file_system_client(file_system=container)
    # get_paths() yields PathProperties whose .name is the FULL path from
    # the container root, so build file clients from the file system client;
    # the original used directory_client.get_file_client(path), which would
    # prepend the directory path a second time.
    paths = file_system_client.get_paths(dir_path)  # all entries under dir_path
    with open(local_temp_file, 'wb') as local_file:
        for entry in paths:  # fix: don't shadow the directory path variable
            if entry.is_directory:  # fix: get_paths() also yields directories
                continue
            file_client = file_system_client.get_file_client(entry.name)
            download = file_client.download_file()
            local_file.write(download.readall())
except Exception as e:
    # Notes snippet: surface the error instead of crashing the session.
    print(e)
To download files from subdirectory using service principal
#pip install azure-storage-file-datalake
#pip install azure-identity
# Download every file under a Data Lake subdirectory into one local file,
# authenticating with a service principal, then load the result with pandas.
from azure.storage.filedatalake import DataLakeServiceClient
from azure.identity import ClientSecretCredential
import pandas as pd

# Azure AD app-registration (service principal) credentials.
tenant_id = "xxx"
app_id = "xxxe"
secret = "xxx"
storage_account_name = "xxx"
account_url = 'https://{}.dfs.core.windows.net/'.format(storage_account_name)

credential = ClientSecretCredential(tenant_id, app_id, secret)
service = DataLakeServiceClient(account_url=account_url, credential=credential)

container_name = 'containername'
dir_path = 'dir1/dir2'
local_temp_file = 'c:\\temp\\download.csv'

try:
    file_system_client = service.get_file_system_client(container_name)
    # get_paths() yields PathProperties whose .name is the FULL path from
    # the container root, so build file clients from the file system client;
    # the original used directory_client.get_file_client(path), which would
    # prepend the directory path a second time.
    paths = file_system_client.get_paths(dir_path)  # all entries under dir_path
    with open(local_temp_file, 'wb') as local_file:
        for entry in paths:  # fix: don't shadow the directory path variable
            if entry.is_directory:  # fix: get_paths() also yields directories
                continue
            print('downloading {}'.format(entry.name))
            file_client = file_system_client.get_file_client(entry.name)
            download = file_client.download_file()
            local_file.write(download.readall())
    # Load the concatenated download for inspection.
    df = pd.read_csv(local_temp_file)
    df.head()
except Exception as e:
    print(e)