Python package for data access in rust (#25)
* merging bioengineering code for ML pipelines (#24)

* adding build using binary downloads (#8)

* adding build using binary downloads

* sorting out the build.rs

* updating build.rs for surrealml package

* prepping version for release

* now has target tracking (#10)

* adding check in build.rs for docs.rs

* removing build.rs for main surrealml to ensure that libraries using the core do not need to do anything in their build.rs

* Kings college london integration (#23)

* adding machine learning pipelines for bioengineering projects at Kings College London

* Remove integrated_training_runner/run_env/ from tracking

* adding machine learning pipelines for bioengineering projects at Kings College London

* adding python bindings to data access for ML training
maxwellflitton authored Feb 24, 2024
1 parent 3a83a30 commit 4c490d4
Showing 18 changed files with 132 additions and 55 deletions.
22 changes: 22 additions & 0 deletions modules/pipelines/data_access/Cargo.toml
@@ -0,0 +1,22 @@
[package]
name = "data-access-layer"
version = "0.1.0"
edition = "2021"


[lib]
name = "data_access_layer"
crate-type = ["cdylib", "rlib"]


[features]
default = ["python"]
python = ["pyo3"]


[dependencies]
serde = { version = "1.0", features = ["derive"] }
serde_json = "1.0"
image = "0.24.8"
# "abi3-py38" tells pyo3 (and maturin) to build using the stable ABI with minimum Python version 3.8
pyo3 = { version = "0.20.0", features = ["abi3-py38"], optional = true }
25 changes: 21 additions & 4 deletions modules/pipelines/data_access/README.md
@@ -3,9 +3,26 @@
Here is where we house libraries that handle the reading and writing of data. For now we are merely reading from JPEG files; however,
we will move on to supporting networking interfaces.

# Basic
# Python Bindings

Basic is a library that handles the reading of jpeg files. It is a simple library that is used to read in the jpeg files and convert them
Although this library is written in Rust, it can also be used from Python. This is done using the `pyo3` crate. We can install
the library with a `pip install` pointing at the directory containing this README file. Once the library is installed we can use it in the
following way:

```python
from data_access_layer.data_access_layer import read_rgb_image


def main():
    height = 480
    width = 853
    data = read_rgb_image("./assets/test.jpg", width, height)
    print(f"\n\nThe image has {len(data)} pixels\n\n")


if __name__ == "__main__":
    main()
```
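The returned `data` is a flat sequence of byte values. A minimal sketch of regrouping such a buffer into rows of `(r, g, b)` pixels — the row-major, three-bytes-per-pixel layout is an assumption on our part, and a synthetic 2×2 buffer stands in for real output so the sketch runs on its own:

```python
def to_pixels(flat, width, height):
    """Group a flat [r, g, b, r, g, b, ...] sequence into rows of (r, g, b) tuples."""
    assert len(flat) == width * height * 3, "unexpected buffer size"
    # One tuple per pixel, three bytes at a time.
    pixels = [tuple(flat[i:i + 3]) for i in range(0, len(flat), 3)]
    # Split the pixel list into `height` rows of `width` pixels each.
    return [pixels[row * width:(row + 1) * width] for row in range(height)]


# Synthetic 2x2 RGB image: red, green / blue, white.
flat = [255, 0, 0,   0, 255, 0,
        0, 0, 255,   255, 255, 255]
rows = to_pixels(flat, width=2, height=2)
print(rows[0][1])  # (0, 255, 0)
```

With the real bindings, `flat` would be the value returned by `read_rgb_image(path, width, height)`.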

# Image Loading

This library handles the reading of JPEG files: it reads them in and converts them to a stream of bytes. The loading and
conversion of the JPEG files is done in `data_access/src/images.rs`.
For our images we are handling data in the following outline:

@@ -68,8 +85,8 @@ can be seen in the file `engines/pytorch_train/tests/test_numpy_quality_control.

# Networking

At this point in time we are just handling image files in the `basic` module in rust, and piping this data into the
python pytorch engine as seen in the following example:
If you just want to use the raw Rust binary for ML training, you can call it directly to load the images and pipe the
data into the Python PyTorch engine, as in the following example:

```bash
./data_access_rust_bin | python pytorch_engine.py
11 changes: 0 additions & 11 deletions modules/pipelines/data_access/basic/Cargo.toml

This file was deleted.

3 changes: 0 additions & 3 deletions modules/pipelines/data_access/basic/src/lib.rs

This file was deleted.

31 changes: 0 additions & 31 deletions modules/pipelines/data_access/basic/src/srt_receiver.rs

This file was deleted.

7 changes: 7 additions & 0 deletions modules/pipelines/data_access/pyproject.toml
@@ -0,0 +1,7 @@
[build-system]
requires = ["maturin>=1.0,<2.0"]
build-backend = "maturin"

[tool.maturin]
# "extension-module" tells pyo3 we want to build an extension module (skips linking against libpython.so)
features = ["pyo3/extension-module"]
@@ -114,7 +114,7 @@ mod tests {

#[test]
fn test_read_image() {
let _data = read_rgb_image("../data_stash/images/169_6300.jpg".to_string(), 480, 853);
let _data = read_rgb_image("./data_stash/images/169_6300.jpg".to_string(), 480, 853);
}

#[test]
@@ -152,7 +152,7 @@ mod tests {

#[test]
fn test_test_calculate_rgb_index_quality_control() {
let raw_data = std::fs::read_to_string("../data_stash/images/dummy_rgb_data.json").unwrap();
let raw_data = std::fs::read_to_string("./data_stash/images/dummy_rgb_data.json").unwrap();
let data: DummyJson = serde_json::from_str(&raw_data).unwrap();

// This will give x y chunks of 50 and an entire rgb image of 150
18 changes: 18 additions & 0 deletions modules/pipelines/data_access/src/lib.rs
@@ -0,0 +1,18 @@
pub mod images;
pub mod srt_reciever;
pub mod tags;

#[cfg(feature = "python")]
pub mod python_api;

#[cfg(feature = "python")]
use pyo3::prelude::*;


#[cfg(feature = "python")]
#[pymodule]
fn data_access_layer(_py: Python, m: &PyModule) -> PyResult<()> {
    m.add_function(wrap_pyfunction!(python_api::read_rgb_image, m)?)?;
    Ok(())
}

13 changes: 13 additions & 0 deletions modules/pipelines/data_access/src/python_api.rs
@@ -0,0 +1,13 @@
//! Here we define the Python API for the data access layer. This is currently just one file, but we will probably
//! expand this module into a directory with multiple files as the project grows, since we will also need to define
//! Python classes in the future when building out the tags for the surgery steps.
use crate::images::read_rgb_image as read_rgb_image_rust;

use pyo3::prelude::*;


#[pyfunction]
pub fn read_rgb_image(path: String, width: usize, height: usize) -> PyResult<Vec<u8>> {
    let data = read_rgb_image_rust(path, width, height);
    Ok(data)
}
Empty file.
@@ -136,7 +136,7 @@ mod tests {

#[test]
fn test_read_tags() {
let tags = read_tags("../data_stash/cleaned_labels/VID68_processed.json").unwrap();
let tags = read_tags("./data_stash/cleaned_labels/VID68_processed.json").unwrap();
let data = parse_surgery_steps(tags);
assert_eq!(SurgeryStep::ClippingAndCutting, data.get("968").unwrap()[0]);
}
2 changes: 1 addition & 1 deletion modules/pipelines/runners/basic_training_runner/Cargo.toml
@@ -6,5 +6,5 @@ edition = "2021"
# See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html

[dependencies]
basic = { path = "../../data_access/basic" }
data-access-layer = { path = "../../data_access" }

4 changes: 2 additions & 2 deletions modules/pipelines/runners/basic_training_runner/src/main.rs
@@ -1,8 +1,8 @@
use basic::images::{
use data_access_layer::images::{
read_rgb_image,
write_frame_to_std_out,
};
use basic::tags::SurgeryStep;
use data_access_layer::tags::SurgeryStep;


fn main() {
16 changes: 16 additions & 0 deletions modules/pipelines/runners/integrated_training_runner/README.md
@@ -0,0 +1,16 @@
# Integrated Runner

This is where the image loading is still written in Rust, but there are Python bindings, meaning that the ML engineer
can train the model in pure Python, importing and using the Rust code as a Python library. This is done by merely
pointing to the `data_access` library and performing a pip install.

# Running the runner

We can run the runner with the following command:

```bash
sh ./scripts/run.sh
```

This will import the Rust image-loading code, which loads the image and gives us the raw, resized, and flattened
image data in Python. This can be fed directly into the ML model for training.
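Before the flattened byte data is fed to a model, the raw values usually need scaling to floats. A minimal pure-Python sketch — the `[0, 1]` scaling convention here is our assumption, not something this repository prescribes:

```python
def bytes_to_floats(flat):
    """Scale raw 0-255 pixel byte values to 0.0-1.0 floats for model input."""
    return [b / 255.0 for b in flat]


sample = [0, 51, 255]           # three raw byte values
print(bytes_to_floats(sample))  # [0.0, 0.2, 1.0]
```

In practice `flat` would be the flattened image data returned by the Rust bindings.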
@@ -0,0 +1,17 @@
#!/usr/bin/env bash

# navigate to directory
# navigate to the directory containing this script
SCRIPTPATH="$( cd "$(dirname "$0")" ; pwd -P )"
cd "$SCRIPTPATH"

cd ..

# recreate the virtual environment from scratch
if [ -d run_env ]; then
    rm -rf run_env
fi

python3 -m venv run_env
source run_env/bin/activate

pip install ../../data_access
python src/main.py
12 changes: 12 additions & 0 deletions modules/pipelines/runners/integrated_training_runner/src/main.py
@@ -0,0 +1,12 @@
from data_access_layer.data_access_layer import read_rgb_image


def main():
    height = 480
    width = 853
    data = read_rgb_image("./assets/test.jpg", width, height)
    print(f"\n\nThe image has {len(data)} pixels\n\n")


if __name__ == "__main__":
    main()
