Skip to content

Commit

Permalink
Merge pull request #25 from MarkPryceMaherMSFT/mpmdev
Browse files Browse the repository at this point in the history
updates and improvements
  • Loading branch information
AzureNick authored Jan 28, 2025
2 parents 570e83b + 70f1118 commit 194eb9a
Show file tree
Hide file tree
Showing 34 changed files with 1,718 additions and 0 deletions.
18 changes: 18 additions & 0 deletions data-warehouse/copywarehouse/CopyWarehouse.csproj
Original file line number Diff line number Diff line change
@@ -0,0 +1,18 @@
<Project Sdk="Microsoft.NET.Sdk">

<PropertyGroup>
<OutputType>Exe</OutputType>
<TargetFramework>net8.0</TargetFramework>
<ImplicitUsings>enable</ImplicitUsings>
<Nullable>enable</Nullable>
<GeneratePackageOnBuild>True</GeneratePackageOnBuild>
</PropertyGroup>

<ItemGroup>
<PackageReference Include="Azure.Identity" Version="1.13.1" />
<PackageReference Include="Azure.Storage.Files.DataLake" Version="12.21.0" />
<PackageReference Include="Microsoft.Graph" Version="5.67.0" />
<PackageReference Include="Microsoft.Identity.Client" Version="4.66.2" />
</ItemGroup>

</Project>
25 changes: 25 additions & 0 deletions data-warehouse/copywarehouse/CopyWarehouse.sln
Original file line number Diff line number Diff line change
@@ -0,0 +1,25 @@

Microsoft Visual Studio Solution File, Format Version 12.00
# Visual Studio Version 17
VisualStudioVersion = 17.11.35431.28
MinimumVisualStudioVersion = 10.0.40219.1
Project("{FAE04EC0-301F-11D3-BF4B-00C04F79EFBC}") = "CopyWarehouse", "CopyWarehouse.csproj", "{40F181A6-5B45-468B-903B-4E6B269DBCE2}"
EndProject
Global
GlobalSection(SolutionConfigurationPlatforms) = preSolution
Debug|Any CPU = Debug|Any CPU
Release|Any CPU = Release|Any CPU
EndGlobalSection
GlobalSection(ProjectConfigurationPlatforms) = postSolution
{40F181A6-5B45-468B-903B-4E6B269DBCE2}.Debug|Any CPU.ActiveCfg = Debug|Any CPU
{40F181A6-5B45-468B-903B-4E6B269DBCE2}.Debug|Any CPU.Build.0 = Debug|Any CPU
{40F181A6-5B45-468B-903B-4E6B269DBCE2}.Release|Any CPU.ActiveCfg = Release|Any CPU
{40F181A6-5B45-468B-903B-4E6B269DBCE2}.Release|Any CPU.Build.0 = Release|Any CPU
EndGlobalSection
GlobalSection(SolutionProperties) = preSolution
HideSolutionNode = FALSE
EndGlobalSection
GlobalSection(ExtensibilityGlobals) = postSolution
SolutionGuid = {2B719D12-6E4D-470C-BA0D-4A45CE04DF0E}
EndGlobalSection
EndGlobal
119 changes: 119 additions & 0 deletions data-warehouse/copywarehouse/Program.cs
Original file line number Diff line number Diff line change
@@ -0,0 +1,119 @@
using Azure.Core;
using Azure.Identity;
using Azure.Storage.Files.DataLake;
using System;
using System.IO;
using System.Threading.Tasks;

class Program
{
    /// <summary>
    /// Copies the Tables folder of a Fabric Warehouse to a Fabric Lakehouse
    /// over the OneLake DFS (ADLS Gen2) endpoint.
    /// Usage: CopyWarehouse {source-workspace-id} {warehouse-id} {destination-workspace-id} {lakehouse-id}
    /// </summary>
    static async Task Main(string[] args)
    {
        if (args.Length != 4)
        {
            Console.WriteLine("CopyWarehouse.exe");
            Console.WriteLine("usage:");
            Console.WriteLine("CopyWarehouse {source-workspace-id} {warehouse-id} {destination-workspace-id} {lakehouse-id}");
            return;
        }

        // Source (OneLake): the workspace id is the DFS file system (container),
        // the warehouse id is the root folder inside it.
        string sourceAccountUrl = "https://onelake.dfs.fabric.microsoft.com/";
        string sourceFileSystemName = $"{args[0]}";
        string sourceFolderPath = $"/{args[1]}/Tables/";

        // Destination (Fabric Lakehouse)
        string destinationAccountUrl = "https://onelake.dfs.fabric.microsoft.com/";
        string destinationFileSystemName = $"{args[2]}";
        string destinationFolderPath = $"{args[3]}/Tables/";

        Console.WriteLine($"Copy from {sourceFileSystemName} {sourceFolderPath} ");
        Console.WriteLine($"Copy to {destinationFileSystemName} {destinationFolderPath} ");

        // Authenticate using DefaultAzureCredential (environment variables,
        // managed identity, az/VS login, ...).
        var credential = new DefaultAzureCredential();

        // Large table files can take a long time to stream, so raise the
        // per-request network timeout to one hour.
        // BUG FIX: the original called NetworkTimeout.Add(...) and discarded the
        // result — TimeSpan is immutable — so the timeout was never actually changed.
        var dlco = new DataLakeClientOptions();
        dlco.Retry.NetworkTimeout = TimeSpan.FromHours(1);

        // Create source and destination DataLake service clients.
        var sourceDataLakeServiceClient = new DataLakeServiceClient(new Uri(sourceAccountUrl), credential, dlco);
        var destinationDataLakeServiceClient = new DataLakeServiceClient(new Uri(destinationAccountUrl), credential, dlco);

        // Get file system (container) clients for the two workspaces.
        var sourceFileSystemClient = sourceDataLakeServiceClient.GetFileSystemClient(sourceFileSystemName);
        var destinationFileSystemClient = destinationDataLakeServiceClient.GetFileSystemClient(destinationFileSystemName);

        // Recursively copy the folder contents.
        await CopyFolderAsync(sourceFileSystemClient, sourceFolderPath, destinationFileSystemClient, destinationFolderPath);

        Console.WriteLine("Copy Complete.");
    }

    /// <summary>
    /// Recursively copies every file under <paramref name="sourceFolderPath"/>
    /// into <paramref name="destinationFolderPath"/>.
    /// </summary>
    static async Task CopyFolderAsync(
        DataLakeFileSystemClient sourceFileSystemClient,
        string sourceFolderPath,
        DataLakeFileSystemClient destinationFileSystemClient,
        string destinationFolderPath)
    {
        // Source directory to enumerate.
        DataLakeDirectoryClient sourceDirectoryClient = sourceFileSystemClient.GetDirectoryClient(sourceFolderPath);

        // NOTE(review): the destination folder is not explicitly created here (the
        // original had CreateIfNotExistsAsync commented out); OneLake appears to
        // materialise parent folders when a file is uploaded — confirm.

        await foreach (var pathItem in sourceDirectoryClient.GetPathsAsync())
        {
            // pathItem.Name is container-relative and carries no leading '/', so strip
            // the leading '/' from sourceFolderPath before measuring the prefix length.
            string relativePath = pathItem.Name.Substring(sourceFolderPath.TrimStart('/').Length).TrimStart('/');
            string destinationPath = Path.Combine(destinationFolderPath, relativePath).Replace("\\", "/");

            if (pathItem.IsDirectory == true)
            {
                // Recurse into sub-directories.
                await CopyFolderAsync(sourceFileSystemClient, pathItem.Name, destinationFileSystemClient, destinationPath);
            }
            else
            {
                Console.WriteLine($"Copying file: {pathItem.Name} to {destinationPath}");
                await CopyFileAsync(sourceFileSystemClient, pathItem.Name, destinationFileSystemClient, destinationPath);
            }
        }
    }

    /// <summary>
    /// Streams a single file from source to destination.
    /// Files that already exist at the destination are skipped, which makes re-runs resumable.
    /// </summary>
    static async Task CopyFileAsync(
        DataLakeFileSystemClient sourceFileSystemClient,
        string sourceFilePath,
        DataLakeFileSystemClient destinationFileSystemClient,
        string destinationFilePath)
    {
        var sourceFileClient = sourceFileSystemClient.GetFileClient(sourceFilePath);
        var destinationFileClient = destinationFileSystemClient.GetFileClient(destinationFilePath);

        // BUG FIX: use ExistsAsync instead of the blocking Exists() call inside an
        // async method (Azure Response<bool> converts implicitly to bool).
        if (await destinationFileClient.ExistsAsync())
        {
            Console.WriteLine("File Exists.");
            return;
        }

        // Stream the source file straight into the destination without buffering
        // the whole file in memory.
        using var stream = await sourceFileClient.OpenReadAsync();
        await destinationFileClient.UploadAsync(stream, overwrite: true);
    }
}
20 changes: 20 additions & 0 deletions data-warehouse/copywarehouse/README.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,20 @@
# CopyWarehouse
This tool can be used to copy or backup a Fabric Warehouse to a Lakehouse.

You can use it to scale out a warehouse and create read-only replicas by copying the Warehouse to a Lakehouse in a different workspace.

## Usage
CopyWarehouse.exe {source-workspace-id} {warehouse-id} {destination-workspace-id} {lakehouse-id}

Use the GUIDs (IDs) for the Fabric workspace, warehouse, and lakehouse. (It does not work with names.)
The destination Fabric lakehouse must exist before running the binary.


## Instructions
1. Download the C# project from Github
1. Open the .csproj file in VSCode or Visual Studio 2022 (I used Community Edition.)
1. Compile the project.
1. Run the binary

## Improvements
This is an open source project, so feel free to submit bug fixes and improvements.
7 changes: 7 additions & 0 deletions data-warehouse/migration/README.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,7 @@
# Migration
This section contains tools, scripts, code examples, and accelerators for migrating workloads to Fabric Data Warehouse.

## connections
Notebook to query the connections in Fabric.


3 changes: 3 additions & 0 deletions data-warehouse/migration/connections/README.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,3 @@
# Connections

Notebook to show the connections used.
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
{"cells":[{"cell_type":"code","source":["# Install semantic-link-labs if its not already installed\n","#%pip install semantic-link-labs\n","import sempy\n","import sempy.fabric as fabric\n","import sempy_labs as labs\n","\n","# name of Gen2, or the database\n","Gen2Name = \"mpm\" # I am using find so a partial string would work, but this should be the servername/workspace name and the database.\n","\n","# list_connections\n","cons = labs.list_connections()\n","# display all the connections / just for debugging\n","#display(cons)\n","\n","# loop through all the connections, just highlight the ones that are using the dedicated sql pool\n","for index, row in cons.iterrows():\n"," #print (index,row[\"Connection Path\"])\n"," x=row[\"Connection Path\"];\n"," if (x.find(Gen2Name) > -1 and row[\"Connection Type\"]=='SQL'):\n"," print(\"Change this connection\", x)\n"," # TODO: Put code here to update the connection\n"],"outputs":[],"execution_count":null,"metadata":{"microsoft":{"language":"python","language_group":"synapse_pyspark"},"collapsed":false},"id":"c16feef8-36e7-4e78-a41c-f9b46dd4bc49"}],"metadata":{"kernel_info":{"name":"synapse_pyspark"},"kernelspec":{"name":"synapse_pyspark","language":"Python","display_name":"Synapse PySpark"},"language_info":{"name":"python"},"microsoft":{"language":"python","language_group":"synapse_pyspark","ms_spell_check":{"ms_spell_check_language":"en"}},"nteract":{"version":"[email protected]"},"spark_compute":{"compute_id":"/trident/default","session_options":{"conf":{"spark.synapse.nbs.session.timeout":"1200000"}}},"synapse_widget":{"version":"0.1","state":{}},"dependencies":{"lakehouse":{"default_lakehouse":"","known_lakehouses":[{"id":""}],"default_lakehouse_name":"Backup_LH","default_lakehouse_workspace_id":""}}},"nbformat":4,"nbformat_minor":5}
9 changes: 9 additions & 0 deletions data-warehouse/notebooks/README.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,9 @@
# create-list-delete-warehouse

A notebook that shows how to create, list and delete a Fabric warehouse using the public REST API.

# workspace size

A notebook that shows the size of a folder on Onelake, it can be used to show the size of Workspace, Warehouse or just a table.

Change the path, as per example to the appropriate folder.
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
{"cells":[{"cell_type":"code","source":["# details from Article : https://learn.microsoft.com/en-us/fabric/data-warehouse/collation\n","# default collation is Latin1_General_100_BIN2_UTF8\n","# new collation is Latin1_General_100_CI_AS_KS_WS_SC_UTF8\n","\n","#REST API : https://learn.microsoft.com/en-us/rest/api/fabric/warehouse/items/create-warehouse?tabs=HTTP\n","\n","#sempy version 0.4.0 or higher\n","!pip install semantic-link --q \n","import json\n","import sempy.fabric as fabric\n","from sempy.fabric.exceptions import FabricHTTPException, WorkspaceNotFoundException\n","\n","workspace_id=spark.conf.get(\"trident.workspace.id\")\n","\n","#Instantiate the client\n","client = fabric.FabricRestClient()\n","\n","uri = f\"https://api.fabric.microsoft.com/v1/workspaces/{workspace_id}/items\"\n","payload = { \n"," \"type\": \"Warehouse\", \n"," \"displayName\": \"marktest\", \n"," \"description\": \"New warehouse with case-insensitive collation\", \n"," \"creationPayload\": { \n"," \"defaultCollation\": \"Latin1_General_100_CI_AS_KS_WS_SC_UTF8\" \n"," } \n","}\n","\n","# Call the REST API\n","response = client.post(uri,json= payload)\n","display(response)\n","\n","#data = json.loads(response.text)\n","#display(data)"],"outputs":[{"output_type":"display_data","data":{"application/vnd.livy.statement-meta+json":{"spark_pool":null,"statement_id":74,"statement_ids":[74],"state":"finished","livy_statement_state":"available","session_id":"65aa4f49-978d-45fd-b906-01addd1f27e9","normalized_state":"finished","queued_time":"2024-11-30T12:09:02.1534597Z","session_start_time":null,"execution_start_time":"2024-11-30T12:09:06.9337526Z","execution_finish_time":"2024-11-30T12:09:11.7107053Z","parent_msg_id":"0f55a180-bb20-4af1-ba70-00fa37c019a0"},"text/plain":"StatementMeta(, 65aa4f49-978d-45fd-b906-01addd1f27e9, 74, Finished, Available, Finished)"},"metadata":{}},{"output_type":"display_data","data":{"text/plain":"<Response 
[202]>"},"metadata":{}}],"execution_count":72,"metadata":{"microsoft":{"language":"python","language_group":"synapse_pyspark"}},"id":"64c0e7fb-eff6-4544-aa5f-2b80354243ae"},{"cell_type":"code","source":["# RESPI API : https://learn.microsoft.com/en-us/rest/api/fabric/warehouse/items/list-warehouses?tabs=HTTP\n","\n","import json\n","import sempy.fabric as fabric\n","from sempy.fabric.exceptions import FabricHTTPException, WorkspaceNotFoundException\n","import time\n","target_displayname = 'marktest'\n","\n","workspace_id=spark.conf.get(\"trident.workspace.id\")\n","\n","#Instantiate the client\n","client = fabric.FabricRestClient()\n","\n","statusuri = f\"https://api.fabric.microsoft.com/v1/workspaces/{workspace_id}/warehouses\"\n","matching_id = None\n","\n","while(matching_id is None):\n"," statusresponsedata = client.get(statusuri).json()\n"," datar = statusresponsedata['value']\n"," for item in datar:\n"," whName = item['displayName']\n"," if whName == target_displayname:\n"," matching_id = item['id']\n"," break\n"," \n"," display(\"Waiting....\")\n"," time.sleep(1)\n","\n","display(f\"Warehouse id is {matching_id}\")\n","display(\"Warehouse details:\")\n","print(item)"],"outputs":[],"execution_count":null,"metadata":{"jupyter":{"source_hidden":false,"outputs_hidden":false},"nteract":{"transient":{"deleting":false}},"microsoft":{"language":"python","language_group":"synapse_pyspark"},"collapsed":false},"id":"41e98e7c-42fc-4521-b7e5-6db5a1656143"},{"cell_type":"code","source":["# RESPI API : https://learn.microsoft.com/en-us/rest/api/fabric/warehouse/items/get-warehouse?tabs=HTTP\n","\n","import json\n","import sempy.fabric as fabric\n","from sempy.fabric.exceptions import FabricHTTPException, WorkspaceNotFoundException\n","\n","workspace_id=spark.conf.get(\"trident.workspace.id\")\n","\n","#Instantiate the client\n","client = fabric.FabricRestClient()\n","\n","#matchind_id = 'bd3bb97e-8255-4b33-8ac2-8f63ec53fd23' \n","\n","statusuri = 
f\"https://api.fabric.microsoft.com/v1/workspaces/{workspace_id}/warehouses/{matching_id}\"\n","\n","statusresponsedata = client.get(statusuri).json()\n","display(statusresponsedata)\n","\n"],"outputs":[],"execution_count":null,"metadata":{"jupyter":{"source_hidden":false,"outputs_hidden":false},"nteract":{"transient":{"deleting":false}},"microsoft":{"language":"python","language_group":"synapse_pyspark"}},"id":"49901dcb-bdd1-4036-920c-42bccf023638"},{"cell_type":"code","source":["# REST API: https://learn.microsoft.com/en-us/rest/api/fabric/warehouse/items/delete-warehouse?tabs=HTTP\n","\n","import json\n","import sempy.fabric as fabric\n","from sempy.fabric.exceptions import FabricHTTPException, WorkspaceNotFoundException\n","\n","workspace_id=spark.conf.get(\"trident.workspace.id\")\n","#Instantiate the client\n","client = fabric.FabricRestClient()\n","\n","uri = f\"https://api.fabric.microsoft.com/v1/workspaces/{workspace_id}/warehouses/{matching_id}\"\n","\n","\n","# Call the REST API\n","response = client.delete(uri)\n","display(response)\n"],"outputs":[],"execution_count":null,"metadata":{"jupyter":{"source_hidden":false,"outputs_hidden":false},"nteract":{"transient":{"deleting":false}},"microsoft":{"language":"python","language_group":"synapse_pyspark"}},"id":"0c9a011d-e4c9-4365-8f93-9cc3927d768e"}],"metadata":{"kernel_info":{"name":"synapse_pyspark"},"kernelspec":{"name":"synapse_pyspark","language":"Python","display_name":"Synapse PySpark"},"language_info":{"name":"python"},"microsoft":{"language":"python","language_group":"synapse_pyspark","ms_spell_check":{"ms_spell_check_language":"en"}},"nteract":{"version":"[email protected]"},"spark_compute":{"compute_id":"/trident/default","session_options":{"conf":{"spark.synapse.nbs.session.timeout":"1200000"}}},"synapse_widget":{"version":"0.1","state":{}},"dependencies":{}},"nbformat":4,"nbformat_minor":5}
1 change: 1 addition & 0 deletions data-warehouse/notebooks/workspace size.ipynb
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
{"cells":[{"cell_type":"code","source":["from notebookutils import mssparkutils\n","\n","def scan_folder(folder,total_size):\n"," #print(f\" folder = {folder}\")\n"," l0 = notebookutils.fs.ls(folder)\n"," for l1 in l0:\n"," if l1.isDir==False:\n"," #print(f\" name = {l1.name} : {l1.size/ 1024/1024:.2f} MB\")\n"," total_size = total_size + l1.size\n"," \n"," for l1 in l0: \n"," if l1.isDir:\n"," dirsize = scan_folder(l1.path, 0);\n"," #print(f\"{l1.path} : {dirsize/ 1024/1024:.2f} MB\")\n"," total_size = total_size + dirsize\n"," \n"," return total_size\n","\n","total_size=0.00;\n","\n","# Warehouse\n","#delta_table_path = f\"abfss://[workspace]@onelake.dfs.fabric.microsoft.com/[warehouse name].Datawarehouse/dbo/[table name]\"\n","\n","# lakehouse\n","#delta_table_path = \"abfss://[workspace]@onelake.dfs.fabric.microsoft.com/[lakehouse name].Lakehouse/Tables/[table name]\"\n","\n","# Workspace\n","delta_table_path = \"abfss://[workspace]@onelake.dfs.fabric.microsoft.com/\"\n","\n","total_size = scan_folder(delta_table_path, total_size)\n","total_size = total_size/ 1024/1024\n","print(f\" Total Workspace Size: {total_size:.2f} MB\")\n"],"outputs":[],"execution_count":null,"metadata":{"microsoft":{"language":"python","language_group":"synapse_pyspark"}},"id":"377c1f19-92bc-4562-a8de-f9088a4ee999"}],"metadata":{"kernel_info":{"name":"synapse_pyspark"},"kernelspec":{"name":"synapse_pyspark","display_name":"synapse_pyspark"},"language_info":{"name":"python"},"microsoft":{"language":"python","language_group":"synapse_pyspark","ms_spell_check":{"ms_spell_check_language":"en"}},"nteract":{"version":"[email protected]"},"spark_compute":{"compute_id":"/trident/default","session_options":{"conf":{"spark.synapse.nbs.session.timeout":"1200000"}}},"dependencies":{}},"nbformat":4,"nbformat_minor":5}
21 changes: 21 additions & 0 deletions open-mirroring/ExcelDemo/AppConfig.cs
Original file line number Diff line number Diff line change
@@ -0,0 +1,21 @@
using System;
using System.Collections.Generic;
using System.Linq;
using System.Text;
using System.Threading.Tasks;

namespace ExcelDemo
{
    /// <summary>
    /// Strongly-typed settings bound from AppConfig.json.
    /// Properties default to empty strings so unbound values never surface as null
    /// (avoids CS8618 under a nullable context and NREs in consumers).
    /// </summary>
    public class AppConfig
    {
        // Folder watched for changes.
        public string folderToWatch { get; set; } = string.Empty;
        // Directory containing the AZCOPY tool.
        public string azcopyFolder { get; set; } = string.Empty;
        // Full path to azcopy.exe.
        public string azcopyPath { get; set; } = string.Empty;
        // Folder where temporary change files are written.
        public string outputFolder { get; set; } = string.Empty;
        // Service principal (SPN) application/client id.
        public string SPN_Application_ID { get; set; } = string.Empty;
        // Service principal secret.
        public string SPN_Secret { get; set; } = string.Empty;
        // Service principal tenant id.
        public string SPN_Tenant_ID { get; set; } = string.Empty;
        // Mirror landing-zone path/URL pasted from Fabric.
        public string MirrorLandingZone { get; set; } = string.Empty;
    }
}
10 changes: 10 additions & 0 deletions open-mirroring/ExcelDemo/AppConfig.json
Original file line number Diff line number Diff line change
@@ -0,0 +1,10 @@
{
"folderToWatch": "<-- folder to watch for changes -->",
"azcopyFolder": "<-- path to the directory of AZCOPY -->",
"azcopyPath": "<-- Path to AZCOPY.exe -->",
"outputFolder": "<-- the place where the temp changes are stored -->",
"SPN_Application_ID": "<-- SPN Application ID -->",
"SPN_Secret": "<-- SPN Secret -->",
"SPN_Tenant_ID": "<-- SPN Tenant ID -->",
"MirrorLandingZone": "<-- copy and paste the landing zone in here -->"
}
Loading

0 comments on commit 194eb9a

Please sign in to comment.