remove sync impl for examples and workspace, update tool (#12)

jackgerrits · web-flow · commit 24d1b3f9f430 · 2022-09-22T14:37:28.000-04:00
* remove sync impl for examples

* move thread unsafety to tool itself

* update tool impl
diff --git a/tool/src/main.rs b/tool/src/main.rs
@@ -31,6 +31,8 @@ struct Cli {
 enum Commands {
     /// Train a VW model file. Currently only single pass and DSJSON input format is supported.
     Train(Train),
+    /// Train a VW model file without any parallelization. Currently only single pass and DSJSON input format is supported.
+    TrainOneThread(TrainOneThread),
 }
 
 #[derive(Parser, Debug)]
@@ -89,6 +91,41 @@ struct Train {
     parse_threads: usize,
 }
 
+// Command-line arguments for the `TrainOneThread` subcommand (training without
+// any parallelization).
+//
+// NOTE: plain `//` comments are used here on purpose. clap's derive macros turn
+// `///` doc comments into help/about text, so adding them would change the
+// CLI output.
+#[derive(Parser, Debug)]
+struct TrainOneThread {
+    // Input files; `train_one_thread` reads them one after another, line by
+    // line. `main` rejects an empty list before training starts.
+    #[clap(
+        short,
+        long,
+        parse(from_os_str),
+        help = "List of input files to process"
+    )]
+    input: Vec<PathBuf>,
+
+    // NOTE(review): `train_one_thread` never reads this field; it always parses
+    // lines with `parse_decision_service_json`. Confirm that is intended.
+    #[clap(long, value_enum, default_value_t = InputFormat::Dsjson, help="Input format to interpret input files as")]
+    input_format: InputFormat,
+
+    // Optional destination for the serialized model written after training.
+    #[clap(
+        short,
+        long,
+        parse(from_os_str),
+        help = "If provided, writes the final trained model to this file"
+    )]
+    output_model: Option<PathBuf>,
+
+    // Optional destination for the readable model written after training.
+    #[clap(
+        long,
+        parse(from_os_str),
+        help = "If provided, writes the final trained model as a readable model to this file. This is the same format as VW's --readable_model ..."
+    )]
+    readable_model: Option<PathBuf>,
+
+    // Raw VW argument string; handed to `process_command_line` before the
+    // workspace is created.
+    #[clap(
+        long,
+        help = "VW arguments to use for model training. Some arguments are not permitted as they are for driver configuration in VW or managed by this tool. For example you cannot supply --data yourself."
+    )]
+    model_args: Option<String>,
+}
+
 fn process_command_line(input: Option<String>) -> Result<Vec<String>> {
     let mut vw_args = match input {
         Some(value) => shlex::Shlex::new(&value).collect(),
@@ -194,26 +231,70 @@ fn process_command_line(input: Option<String>) -> Result<Vec<String>> {
     Ok(vw_args)
 }
 
+/// Wraps a `Workspace` in an `UnsafeCell` so that shared and mutable references
+/// to the same workspace can be handed out from a `&self` receiver.
+///
+/// Nothing in this type synchronizes or tracks those references. A `&mut`
+/// obtained from [`UnsafeWorkspaceWrapper::as_mut`] can coexist with a `&`
+/// obtained from [`UnsafeWorkspaceWrapper::as_ref`]; keeping such use free of
+/// conflicting access is entirely the caller's responsibility.
+pub struct UnsafeWorkspaceWrapper {
+    pub workspace: UnsafeCell<Workspace>,
+}
+
+impl UnsafeWorkspaceWrapper {
+    /// Returns a shared reference to the wrapped workspace.
+    pub fn as_ref(&self) -> &Workspace {
+        let raw: *mut Workspace = self.workspace.get();
+        // SAFETY: `UnsafeCell::get` never returns null and the pointee lives as
+        // long as `self`. Freedom from conflicting mutable access is NOT
+        // checked here; callers must guarantee it.
+        unsafe { &*raw }
+    }
+
+    /// Returns a mutable reference to the wrapped workspace from a shared
+    /// receiver.
+    ///
+    /// NOTE(review): this is a safe function that can create aliasing `&mut`
+    /// references; callers must ensure no conflicting access is live.
+    pub fn as_mut(&self) -> &mut Workspace {
+        let raw: *mut Workspace = self.workspace.get();
+        // SAFETY: the pointer is non-null and valid for the lifetime of `self`
+        // (see `as_ref`). Exclusivity is the caller's obligation.
+        unsafe { &mut *raw }
+    }
+}
+
+// SAFETY (unchecked): these impls assert that the wrapper may be moved to and
+// shared between threads. Nothing in this type enforces that; soundness relies
+// on how callers use the references handed out above.
+unsafe impl Send for UnsafeWorkspaceWrapper {}
+unsafe impl Sync for UnsafeWorkspaceWrapper {}
+
+/// Trains a model on the calling thread, processing the input files one after
+/// another and each file line by line.
+///
+/// Every line is parsed with `parse_decision_service_json`, set up, learned
+/// from, and has its stats recorded; the example is then returned to the pool.
+/// After all files are consumed the pass is ended and, if requested, the model
+/// and/or readable model are written to disk.
+///
+/// # Errors
+///
+/// Returns an error if the workspace cannot be created, an input file cannot
+/// be opened or read, any workspace call fails, or a model file cannot be
+/// written.
+fn train_one_thread(args: TrainOneThread) -> Result<()> {
+    let vw_args = process_command_line(args.model_args)?;
+    let pool = ExamplePool::new();
+    let mut workspace = Workspace::new(&vw_args)
+        .with_context(|| format!("Failed to create workspace with args {:?}", vw_args))?;
+
+    for path in args.input {
+        // I/O failures on user-supplied paths are reported as errors rather
+        // than panics, so the caller gets a normal failure with context.
+        let file =
+            File::open(&path).with_context(|| format!("Failed to open file {:?}", path))?;
+        for line in io::BufReader::new(file).lines() {
+            let line =
+                line.with_context(|| format!("Failed to read line from file {:?}", path))?;
+            let mut ex =
+                workspace.setup(workspace.parse_decision_service_json(&line, &pool)?)?;
+            workspace.learn(&mut ex)?;
+            workspace.record_stats(&mut ex)?;
+            // Hand the example back so the pool can reuse it.
+            pool.return_example(ex);
+        }
+    }
+    workspace.end_pass()?;
+
+    if let Some(model_file) = args.output_model {
+        fs::write(model_file, &*workspace.serialize_model()?)?;
+    }
+
+    if let Some(model_file) = args.readable_model {
+        fs::write(model_file, workspace.serialize_readable_model()?)?;
+    }
+
+    Ok(())
+}
+
 fn train(args: Train) -> Result<()> {
     rayon::ThreadPoolBuilder::new()
         .num_threads(args.parse_threads)
         .build_global()?;
 
     let vw_args = process_command_line(args.model_args)?;
 
-    // TODO process illegal options.
-
     let pool = ExamplePool::new();
 
-    // We use an unsafe cell, because parse_decision_service_json, and the learning code does not interact.
-    let workspace: UnsafeCell<Workspace> = Workspace::new(&vw_args)
+    let unsafe_workspace_cell: UnsafeCell<Workspace> = Workspace::new(&vw_args)
         .with_context(|| format!("Failed to create workspace with args {:?}", vw_args))?
         .into();
+    // We use an unsafe cell because parse_decision_service_json and the learning code do not interact.
+    let shareable_workspace: UnsafeWorkspaceWrapper = UnsafeWorkspaceWrapper {
+        workspace: unsafe_workspace_cell,
+    };
     let (tx, rx) = flume::bounded(args.queue_size);
 
-    let ws_ref = unsafe { workspace.get().as_ref().unwrap() };
-    let ws = unsafe { workspace.get().as_mut().unwrap() };
-
     std::thread::scope(|s| -> Result<()> {
         s.spawn(|| {
             for file in args.input {
@@ -233,7 +314,11 @@ fn train(args: Train) -> Result<()> {
                     }
                     let output_lines: Vec<_> = batch
                         .into_par_iter()
-                        .map(|line| ws_ref.parse_decision_service_json(&line, &pool))
+                        .map(|line| {
+                            shareable_workspace
+                                .as_ref()
+                                .parse_decision_service_json(&line, &pool)
+                        })
                         .collect();
 
                     for line in output_lines {
@@ -247,29 +332,33 @@ fn train(args: Train) -> Result<()> {
             std::mem::drop(tx);
         });
 
+        let unsafe_workspace_ref = shareable_workspace.as_mut();
+
         loop {
             // TODO consider skipping broken examples.
             let res = rx.recv();
             match res {
                 Ok(line) => {
-                    let mut ex = ws.setup(line?)?;
-                    ws.learn(&mut ex)?;
-                    ws.record_stats(&mut ex)?;
+                    let mut ex = unsafe_workspace_ref.setup(line?)?;
+                    unsafe_workspace_ref.learn(&mut ex)?;
+                    unsafe_workspace_ref.record_stats(&mut ex)?;
                     pool.return_example(ex);
                 }
                 // Sender has been dropped. Stop here.
                 Err(_) => break,
             }
         }
-        ws.end_pass()?;
+        unsafe_workspace_ref.end_pass()?;
         Ok(())
     })?;
+
+    let unsafe_workspace_ref = shareable_workspace.as_ref();
     if let Some(model_file) = args.output_model {
-        fs::write(model_file, &*ws_ref.serialize_model()?)?;
+        fs::write(model_file, &*unsafe_workspace_ref.serialize_model()?)?;
     }
 
     if let Some(model_file) = args.readable_model {
-        fs::write(model_file, ws_ref.serialize_readable_model()?)?;
+        fs::write(model_file, unsafe_workspace_ref.serialize_readable_model()?)?;
     }
 
     Ok(())
@@ -290,5 +379,17 @@ fn main() -> Result<()> {
             }
             train(args)
         }
+        // Single-threaded training: require at least one input file; otherwise
+        // print this subcommand's help and fail.
+        Commands::TrainOneThread(args) => {
+            if args.input.is_empty() {
+                let mut app = Cli::into_app();
+                // clap derives kebab-case subcommand names by default, so the
+                // `TrainOneThread` variant is looked up as "train-one-thread"
+                // (assumes no `rename_all` override on `Commands` — confirm).
+                // Looking up "train" here would print the wrong help text.
+                let sub = app
+                    .find_subcommand_mut("train-one-thread")
+                    .expect("train-one-thread must exist as a subcommand");
+                sub.print_help()?;
+                return Err(anyhow!("At least 1 input file is required."));
+            }
+            train_one_thread(args)
+        }
     }
 }
diff --git a/vowpalwabbit/src/example.rs b/vowpalwabbit/src/example.rs
@@ -40,9 +40,7 @@ pub struct RawExample {
 }
 
 unsafe impl Send for Example {}
-unsafe impl Sync for Example {}
 unsafe impl Send for RawExample {}
-unsafe impl Sync for RawExample {}
 impl RawExample {
     pub fn new() -> RawExample {
         unsafe {
diff --git a/vowpalwabbit/src/multi_example.rs b/vowpalwabbit/src/multi_example.rs
@@ -24,9 +24,7 @@ pub struct RawMultiExample {
 }
 
 unsafe impl Send for MultiExample {}
-unsafe impl Sync for MultiExample {}
 unsafe impl Send for RawMultiExample {}
-unsafe impl Sync for RawMultiExample {}
 
 impl RawMultiExample {
     pub fn new() -> RawMultiExample {
diff --git a/vowpalwabbit/src/workspace.rs b/vowpalwabbit/src/workspace.rs
@@ -15,7 +15,6 @@ pub struct Workspace {
 }
 
 unsafe impl Send for Workspace {}
-unsafe impl Sync for Workspace {}
 
 unsafe fn get_action_scores_or_probs(pred_ptr: *mut c_void) -> Vec<(u32, f32)> {
     let mut length = MaybeUninit::<size_t>::zeroed();

Original file line number	Diff line number	Diff line change
`@@ -40,9 +40,7 @@ pub struct RawExample {`
`40`	`40`	`}`
`41`	`41`
`42`	`42`	`unsafe impl Send for Example {}`
`43`		`-unsafe impl Sync for Example {}`
`44`	`43`	`unsafe impl Send for RawExample {}`
`45`		`-unsafe impl Sync for RawExample {}`
`46`	`44`	`impl RawExample {`
`47`	`45`	`pub fn new() -> RawExample {`
`48`	`46`	`unsafe {`
Original file line number	Diff line number	Diff line change
`@@ -24,9 +24,7 @@ pub struct RawMultiExample {`
`24`	`24`	`}`
`25`	`25`
`26`	`26`	`unsafe impl Send for MultiExample {}`
`27`		`-unsafe impl Sync for MultiExample {}`
`28`	`27`	`unsafe impl Send for RawMultiExample {}`
`29`		`-unsafe impl Sync for RawMultiExample {}`
`30`	`28`
`31`	`29`	`impl RawMultiExample {`
`32`	`30`	`pub fn new() -> RawMultiExample {`
Original file line number	Diff line number	Diff line change
`@@ -15,7 +15,6 @@ pub struct Workspace {`
`15`	`15`	`}`
`16`	`16`
`17`	`17`	`unsafe impl Send for Workspace {}`
`18`		`-unsafe impl Sync for Workspace {}`
`19`	`18`
`20`	`19`	`unsafe fn get_action_scores_or_probs(pred_ptr: *mut c_void) -> Vec<(u32, f32)> {`
`21`	`20`	`let mut length = MaybeUninit::<size_t>::zeroed();`