From efebfca680470b8d9c995fc81a9386bb68fdbf5d Mon Sep 17 00:00:00 2001 From: Ives van Hoorne Date: Thu, 26 May 2022 21:26:37 +0200 Subject: [PATCH 1/3] MMDS: Add support for lowercased headers Signed-off-by: Ives van Hoorne update test Co-authored-by: Luminita Voicu move test to valid headers --- docs/mmds/mmds-user-guide.md | 2 +- src/mmds/src/lib.rs | 6 ++--- src/mmds/src/token_headers.rs | 22 +++++++++++++++++-- .../integration_tests/functional/test_mmds.py | 2 +- 4 files changed, 25 insertions(+), 7 deletions(-) diff --git a/docs/mmds/mmds-user-guide.md b/docs/mmds/mmds-user-guide.md index c6bac05100e..5f9b3f012d0 100644 --- a/docs/mmds/mmds-user-guide.md +++ b/docs/mmds/mmds-user-guide.md @@ -249,7 +249,7 @@ The session must start with an HTTP `PUT` request that generates the session tok In order to be successful, the request must respect the following constraints: - must be directed towards `/latest/api/token` path -- must contain a `X-ametadata-token-ttl-seconds` header specifying the token lifetime +- must contain a `X-metadata-token-ttl-seconds` header specifying the token lifetime in seconds. The value cannot be lower than 1 or greater than 21600 (6 hours). - must not contain a `X-Forwarded-For` header. diff --git a/src/mmds/src/lib.rs b/src/mmds/src/lib.rs index 5360c5a0f0a..ae206ab7d6e 100644 --- a/src/mmds/src/lib.rs +++ b/src/mmds/src/lib.rs @@ -42,7 +42,7 @@ impl fmt::Display for Error { ), Error::NoTtlProvided => write!( f, - "Token time to live value not found. Use `X-metadata-token-ttl_seconds` header to \ + "Token time to live value not found. Use `X-metadata-token-ttl-seconds` header to \ specify the token's lifetime." ), Error::ResourceNotFound(ref uri) => { @@ -705,8 +705,8 @@ mod tests { assert_eq!( Error::NoTtlProvided.to_string(), - "Token time to live value not found. Use `X-metadata-token-ttl_seconds` header to \ - specify the token's lifetime." + "Token time to live value not found. Use `X-metadata-token-ttl-seconds` header to \ + specify the token's lifetime." ); assert_eq!( diff --git a/src/mmds/src/token_headers.rs b/src/mmds/src/token_headers.rs index 5d13d9a4245..1f03d0b4f4e 100644 --- a/src/mmds/src/token_headers.rs +++ b/src/mmds/src/token_headers.rs @@ -39,12 +39,19 @@ impl TokenHeaders { /// Return `TokenHeaders` from headers map. pub fn try_from(map: &HashMap) -> Result { let mut headers = Self::default(); + let lowercased_headers: HashMap = map + .iter() + .map(|(k, v)| (k.to_lowercase(), v.clone())) + .collect(); - if let Some(token) = map.get(TokenHeaders::X_METADATA_TOKEN) { + if let Some(token) = lowercased_headers.get(&TokenHeaders::X_METADATA_TOKEN.to_lowercase()) + { headers.x_metadata_token = Some(token.to_string()); } - if let Some(value) = map.get(TokenHeaders::X_METADATA_TOKEN_TTL_SECONDS) { + if let Some(value) = + lowercased_headers.get(&TokenHeaders::X_METADATA_TOKEN_TTL_SECONDS.to_lowercase()) + { match value.parse::() { Ok(seconds) => { headers.x_metadata_token_ttl_seconds = Some(seconds); @@ -127,6 +134,17 @@ mod tests { let headers = TokenHeaders::try_from(&map).unwrap(); assert_eq!(*headers.x_metadata_token().unwrap(), "".to_string()); + // Lowercased headers + let mut map: HashMap = HashMap::default(); + map.insert( + TokenHeaders::X_METADATA_TOKEN_TTL_SECONDS + .to_string() + .to_lowercase(), + "60".to_string(), + ); + let headers = TokenHeaders::try_from(&map).unwrap(); + assert_eq!(headers.x_metadata_token_ttl_seconds().unwrap(), 60); + // Invalid value. let mut map: HashMap = HashMap::default(); map.insert( diff --git a/tests/integration_tests/functional/test_mmds.py b/tests/integration_tests/functional/test_mmds.py index 870bce0a70a..92f735436df 100644 --- a/tests/integration_tests/functional/test_mmds.py +++ b/tests/integration_tests/functional/test_mmds.py @@ -832,7 +832,7 @@ def test_mmds_v2_negative(test_microvm_with_api, network_config): cmd = f"curl -m 2 -s -X PUT http://{DEFAULT_IPV4}/latest/api/token" expected = ( "Token time to live value not found. Use " - "`X-metadata-token-ttl_seconds` header to specify " + "`X-metadata-token-ttl-seconds` header to specify " "the token's lifetime." ) _run_guest_cmd(ssh_connection, cmd, expected) From a43235462de3642f4fa2130c9722b60da3cd3298 Mon Sep 17 00:00:00 2001 From: Ives van Hoorne Date: Tue, 21 Jun 2022 12:25:56 +0200 Subject: [PATCH 2/3] Support shared mmap for running VMs This allows us to run VMs while streaming memory changes to disk support mmap shared update log add configuration for memory backing file Progress tweaks --- .../seccomp/aarch64-unknown-linux-musl.json | 13 +++ .../seccomp/x86_64-unknown-linux-musl.json | 13 +++ src/api_server/src/parsed_request.rs | 2 + .../src/request/memory_backing_file.rs | 42 ++++++++++ src/api_server/src/request/mod.rs | 1 + src/api_server/swagger/firecracker.yaml | 31 +++++++ src/logger/src/metrics.rs | 4 + src/vm-memory/src/lib.rs | 2 +- src/vmm/src/builder.rs | 60 ++++++++++---- src/vmm/src/lib.rs | 11 +-- src/vmm/src/persist.rs | 80 ++++++++++++++----- src/vmm/src/resources.rs | 14 ++++ src/vmm/src/rpc_interface.rs | 45 ++++++++--- src/vmm/src/vmm_config/memory_backing_file.rs | 31 +++++++ src/vmm/src/vmm_config/mod.rs | 2 + 15 files changed, 303 insertions(+), 48 deletions(-) create mode 100644 src/api_server/src/request/memory_backing_file.rs create mode 100644 src/vmm/src/vmm_config/memory_backing_file.rs diff --git a/resources/seccomp/aarch64-unknown-linux-musl.json b/resources/seccomp/aarch64-unknown-linux-musl.json index ca0e51381a5..5a2204f41d6 100644 --- a/resources/seccomp/aarch64-unknown-linux-musl.json +++ b/resources/seccomp/aarch64-unknown-linux-musl.json @@ -596,6 +596,19 @@ } ] }, + { + "syscall": "msync", + "comment": "Used to sync memory from mmap to disk", + "args": [ + { + "index": 2, + "type": "dword", + "op": "eq", + "val": 4, + "comment": "MS_SYNC" + } + ] + }, { "syscall": "rt_sigaction", "comment": "rt_sigaction is used by libc::abort during a panic to install the default handler for SIGABRT", diff --git a/resources/seccomp/x86_64-unknown-linux-musl.json b/resources/seccomp/x86_64-unknown-linux-musl.json index b419581ea3f..02bc9629a6a 100644 --- a/resources/seccomp/x86_64-unknown-linux-musl.json +++ b/resources/seccomp/x86_64-unknown-linux-musl.json @@ -248,6 +248,19 @@ } ] }, + { + "syscall": "msync", + "comment": "Used to sync memory from mmap to disk", + "args": [ + { + "index": 2, + "type": "dword", + "op": "eq", + "val": 4, + "comment": "MS_SYNC" + } + ] + }, { "syscall": "rt_sigaction", "comment": "rt_sigaction is used by libc::abort during a panic to install the default handler for SIGABRT", diff --git a/src/api_server/src/parsed_request.rs b/src/api_server/src/parsed_request.rs index 9b4cd6e64b9..7565853e1ab 100644 --- a/src/api_server/src/parsed_request.rs +++ b/src/api_server/src/parsed_request.rs @@ -17,6 +17,7 @@ use crate::request::logger::parse_put_logger; use crate::request::machine_configuration::{ parse_get_machine_config, parse_patch_machine_config, parse_put_machine_config, }; +use crate::request::memory_backing_file::parse_put_memory_backing_file; use crate::request::metrics::parse_put_metrics; use crate::request::mmds::{parse_get_mmds, parse_patch_mmds, parse_put_mmds}; use crate::request::net::{parse_patch_net, parse_put_net}; @@ -112,6 +113,7 @@ impl ParsedRequest { (Method::Put, "network-interfaces", Some(body)) => { parse_put_net(body, path_tokens.get(1)) } + (Method::Put, "memory-backing-file", Some(body)) => parse_put_memory_backing_file(body), (Method::Put, "shutdown-internal", None) => { Ok(ParsedRequest::new(RequestAction::ShutdownInternal)) } diff --git a/src/api_server/src/request/memory_backing_file.rs b/src/api_server/src/request/memory_backing_file.rs new file mode 100644 index 00000000000..e29798039bc --- /dev/null +++ b/src/api_server/src/request/memory_backing_file.rs @@ -0,0 +1,42 @@ +// Copyright 2018 Amazon.com, Inc. or its affiliates. All Rights Reserved. +// SPDX-License-Identifier: Apache-2.0 + +use super::super::VmmAction; +use crate::parsed_request::{Error, ParsedRequest}; +use crate::request::Body; +use logger::{IncMetric, METRICS}; +use vmm::vmm_config::memory_backing_file::MemoryBackingFileConfig; + +pub(crate) fn parse_put_memory_backing_file(body: &Body) -> Result { + METRICS.put_api_requests.memory_backing_file_cfg_count.inc(); + Ok(ParsedRequest::new_sync(VmmAction::SetMemoryBackingFile( + serde_json::from_slice::(body.raw()).map_err(|e| { + METRICS.put_api_requests.memory_backing_file_cfg_fails.inc(); + Error::SerdeJson(e) + })?, + ))) +} + +#[cfg(test)] +mod tests { + use std::path::PathBuf; + + use super::*; + + #[test] + fn test_parse_memory_backing_file() { + assert!(parse_put_memory_backing_file(&Body::new("invalid_payload")).is_err()); + + let body = r#"{ + "path": "./memory.snap" + }"#; + let same_body = MemoryBackingFileConfig { + path: PathBuf::from("./memory.snap"), + }; + let result = parse_put_memory_backing_file(&Body::new(body)); + assert!(result.is_ok()); + let parsed_req = result.unwrap_or_else(|_e| panic!("Failed test.")); + + assert!(parsed_req == ParsedRequest::new_sync(VmmAction::SetMemoryBackingFile(same_body))); + } +} diff --git a/src/api_server/src/request/mod.rs b/src/api_server/src/request/mod.rs index 75f9a0daef3..c2f52110bf0 100644 --- a/src/api_server/src/request/mod.rs +++ b/src/api_server/src/request/mod.rs @@ -8,6 +8,7 @@ pub mod drive; pub mod instance_info; pub mod logger; pub mod machine_configuration; +pub mod memory_backing_file; pub mod metrics; pub mod mmds; pub mod net; diff --git a/src/api_server/swagger/firecracker.yaml b/src/api_server/swagger/firecracker.yaml index 07d242532e6..14cae2b7a69 100644 --- a/src/api_server/swagger/firecracker.yaml +++ b/src/api_server/swagger/firecracker.yaml @@ -350,6 +350,29 @@ paths: description: Internal server error schema: $ref: "#/definitions/Error" + + /memory-backing-file: + put: + summary: Configures a memory backing file to sync the memory changes to during the runtime of the vm + operationId: putMemoryBackingFile + parameters: + - name: body + in: body + description: Path to memory backing file + required: true + schema: + $ref: "#/definitions/MemoryBackingFile" + responses: + 204: + description: Memory backing file configured + 400: + description: Memory backing file failed + schema: + $ref: "#/definitions/Error" + default: + description: Internal server error. + schema: + $ref: "#/definitions/Error" /metrics: put: @@ -1047,6 +1070,14 @@ definitions: tx_rate_limiter: $ref: "#/definitions/RateLimiter" + MemoryBackingFile: + type: object + required: + - path + properties: + path: + type: string + PartialDrive: type: object required: diff --git a/src/logger/src/metrics.rs b/src/logger/src/metrics.rs index 779689f04b8..aba286cc4d8 100644 --- a/src/logger/src/metrics.rs +++ b/src/logger/src/metrics.rs @@ -403,6 +403,10 @@ pub struct PutRequestsMetrics { pub machine_cfg_count: SharedIncMetric, /// Number of failures in configuring the machine. pub machine_cfg_fails: SharedIncMetric, + /// Number of PUTs for setting memory backing file. + pub memory_backing_file_cfg_count: SharedIncMetric, + /// Number of failures in configuring the machine. + pub memory_backing_file_cfg_fails: SharedIncMetric, /// Number of PUTs for initializing the metrics system. pub metrics_count: SharedIncMetric, /// Number of failures in initializing the metrics system. diff --git a/src/vm-memory/src/lib.rs b/src/vm-memory/src/lib.rs index 814f79098b4..1d18139dd99 100644 --- a/src/vm-memory/src/lib.rs +++ b/src/vm-memory/src/lib.rs @@ -117,7 +117,7 @@ pub fn create_guest_memory( for region in regions { let flags = match region.0 { None => libc::MAP_NORESERVE | libc::MAP_PRIVATE | libc::MAP_ANONYMOUS, - Some(_) => libc::MAP_NORESERVE | libc::MAP_PRIVATE, + Some(_) => libc::MAP_NORESERVE | libc::MAP_SHARED, }; let mmap_region = diff --git a/src/vmm/src/builder.rs b/src/vmm/src/builder.rs index bffb43e47cf..c096d8cb132 100644 --- a/src/vmm/src/builder.rs +++ b/src/vmm/src/builder.rs @@ -5,9 +5,11 @@ use std::convert::TryFrom; use std::fmt::{Display, Formatter}; +use std::fs::File; use std::io::{self, Read, Seek, SeekFrom}; use std::os::unix::io::{AsRawFd, RawFd}; use std::sync::{Arc, Mutex}; +use vm_memory::FileOffset; use arch::InitrdConfig; #[cfg(target_arch = "x86_64")] @@ -28,7 +30,6 @@ use linux_loader::loader::KernelLoader; use logger::{error, warn, METRICS}; use seccompiler::BpfThreadMap; use snapshot::Persist; -use userfaultfd::Uffd; use utils::eventfd::EventFd; use utils::terminal::Terminal; use utils::time::TimestampUs; @@ -43,7 +44,7 @@ use crate::construct_kvm_mpidrs; use crate::device_manager::legacy::PortIODeviceManager; use crate::device_manager::mmio::MMIODeviceManager; use crate::device_manager::persist::MMIODevManagerConstructorArgs; -use crate::persist::{MicrovmState, MicrovmStateError}; +use crate::persist::{MemoryDescriptor, MicrovmState, MicrovmStateError}; use crate::resources::VmResources; use crate::vmm_config::boot_source::BootConfig; use crate::vmm_config::instance_info::InstanceInfo; @@ -58,6 +59,8 @@ use crate::{device_manager, mem_size_mib, Error, EventManager, Vmm, VmmEventsObs pub enum StartMicrovmError { /// Unable to attach block device to Vmm. AttachBlockDevice(io::Error), + /// Unable to create the memory backing file. + BackingMemoryFile(io::Error), /// This error is thrown by the minimal boot loader implementation. ConfigureSystem(arch::Error), /// Internal errors are due to resource exhaustion. @@ -112,6 +115,9 @@ impl Display for StartMicrovmError { write!(f, "Unable to attach block device to Vmm: {}", err) } ConfigureSystem(err) => write!(f, "System configuration error: {:?}", err), + BackingMemoryFile(err) => { + write!(f, "Unable to create the memory backing file: {}", err) + } CreateRateLimiter(err) => write!(f, "Cannot create RateLimiter: {}", err), CreateNetDevice(err) => { let mut err_msg = format!("{:?}", err); @@ -231,7 +237,7 @@ fn create_vmm_and_vcpus( instance_info: &InstanceInfo, event_manager: &mut EventManager, guest_memory: GuestMemoryMmap, - uffd: Option, + memory_descriptor: Option, track_dirty_pages: bool, vcpu_count: u8, ) -> std::result::Result<(Vmm, Vec), StartMicrovmError> { @@ -297,7 +303,7 @@ fn create_vmm_and_vcpus( shutdown_exit_code: None, vm, guest_memory, - uffd, + memory_descriptor, vcpus_handles: Vec::new(), vcpus_exit_evt, mmio_device_manager, @@ -329,8 +335,23 @@ pub fn build_microvm_for_boot( let boot_config = vm_resources.boot_source().ok_or(MissingKernelConfig)?; let track_dirty_pages = vm_resources.track_dirty_pages(); - let guest_memory = - create_guest_memory(vm_resources.vm_config().mem_size_mib, track_dirty_pages)?; + + let backing_memory_file = if let Some(ref file) = vm_resources.backing_memory_file { + file.set_len((vm_resources.vm_config().mem_size_mib * 1024 * 1024) as u64) + .map_err(|e| { + error!("Failed to set backing memory file size: {}", e); + StartMicrovmError::BackingMemoryFile(e) + })?; + + Some(file.clone()) + } else { + None + }; + let guest_memory = create_guest_memory( + vm_resources.vm_config().mem_size_mib, + backing_memory_file.clone(), + track_dirty_pages, + )?; let vcpu_config = vm_resources.vcpu_config(); let entry_addr = load_kernel(boot_config, &guest_memory)?; let initrd = load_initrd_from_config(boot_config, &guest_memory)?; @@ -362,7 +383,7 @@ pub fn build_microvm_for_boot( instance_info, event_manager, guest_memory, - None, + backing_memory_file.map(MemoryDescriptor::File), track_dirty_pages, vcpu_config.vcpu_count, )?; @@ -451,7 +472,7 @@ pub fn build_microvm_from_snapshot( event_manager: &mut EventManager, microvm_state: MicrovmState, guest_memory: GuestMemoryMmap, - uffd: Option, + memory_descriptor: Option, track_dirty_pages: bool, seccomp_filters: &BpfThreadMap, vm_resources: &mut VmResources, @@ -466,7 +487,7 @@ pub fn build_microvm_from_snapshot( instance_info, event_manager, guest_memory.clone(), - uffd, + memory_descriptor, track_dirty_pages, vcpu_count, )?; @@ -581,15 +602,24 @@ pub fn build_microvm_from_snapshot( /// Creates GuestMemory of `mem_size_mib` MiB in size. pub fn create_guest_memory( mem_size_mib: usize, + backing_memory_file: Option>, track_dirty_pages: bool, ) -> std::result::Result { let mem_size = mem_size_mib << 20; let arch_mem_regions = arch::arch_memory_regions(mem_size); + let mut offset = 0_u64; vm_memory::create_guest_memory( &arch_mem_regions .iter() - .map(|(addr, size)| (None, *addr, *size)) + .map(|(addr, size)| { + let file_offset = backing_memory_file + .clone() + .map(|file| FileOffset::from_arc(file, offset)); + offset += *size as u64; + + (file_offset, *addr, *size) + }) .collect::>()[..], track_dirty_pages, ) @@ -1068,7 +1098,7 @@ pub mod tests { } pub(crate) fn default_vmm() -> Vmm { - let guest_memory = create_guest_memory(128, false).unwrap(); + let guest_memory = create_guest_memory(128, None, false).unwrap(); let vcpus_exit_evt = EventFd::new(libc::EFD_NONBLOCK) .map_err(Error::EventFd) @@ -1096,12 +1126,12 @@ pub mod tests { shutdown_exit_code: None, vm, guest_memory, - uffd: None, vcpus_handles: Vec::new(), vcpus_exit_evt, mmio_device_manager, #[cfg(target_arch = "x86_64")] pio_device_manager, + memory_descriptor: None, } } @@ -1283,13 +1313,13 @@ pub mod tests { // Case 1: create guest memory without dirty page tracking { - let guest_memory = create_guest_memory(mem_size, false).unwrap(); + let guest_memory = create_guest_memory(mem_size, None, false).unwrap(); assert!(!is_dirty_tracking_enabled(&guest_memory)); } // Case 2: create guest memory with dirty page tracking { - let guest_memory = create_guest_memory(mem_size, true).unwrap(); + let guest_memory = create_guest_memory(mem_size, None, true).unwrap(); assert!(is_dirty_tracking_enabled(&guest_memory)); } } @@ -1297,7 +1327,7 @@ pub mod tests { #[test] fn test_create_vcpus() { let vcpu_count = 2; - let guest_memory = create_guest_memory(128, false).unwrap(); + let guest_memory = create_guest_memory(128, None, false).unwrap(); #[allow(unused_mut)] let mut vm = setup_kvm_vm(&guest_memory, false).unwrap(); diff --git a/src/vmm/src/lib.rs b/src/vmm/src/lib.rs index d71932525b5..90975ac6acc 100644 --- a/src/vmm/src/lib.rs +++ b/src/vmm/src/lib.rs @@ -50,10 +50,10 @@ use devices::virtio::{ use devices::BusDevice; use event_manager::{EventManager as BaseEventManager, EventOps, Events, MutEventSubscriber}; use logger::{error, info, warn, LoggerError, MetricsError, METRICS}; +use persist::MemoryDescriptor; use rate_limiter::BucketUpdate; use seccompiler::BpfProgram; use snapshot::Persist; -use userfaultfd::Uffd; use utils::epoll::EventSet; use utils::eventfd::EventFd; use vm_memory::{GuestMemory, GuestMemoryMmap, GuestMemoryRegion}; @@ -260,10 +260,6 @@ pub struct Vmm { // Guest VM core resources. vm: Vm, guest_memory: GuestMemoryMmap, - // Save UFFD in order to keep it open in the Firecracker process, as well. - // Since this field is never read again, we need to allow `dead_code`. - #[allow(dead_code)] - uffd: Option, vcpus_handles: Vec, // Used by Vcpus and devices to initiate teardown; Vmm should never write here. vcpus_exit_evt: EventFd, @@ -272,6 +268,11 @@ pub struct Vmm { mmio_device_manager: MMIODeviceManager, #[cfg(target_arch = "x86_64")] pio_device_manager: PortIODeviceManager, + + // The mem file that should be mmaped. We need to keep a reference of the UFFD in the + // process so we allow dead_code + #[allow(dead_code)] + memory_descriptor: Option, } impl Vmm { diff --git a/src/vmm/src/persist.rs b/src/vmm/src/persist.rs index 309cbce46b3..0ff5ca1f9da 100644 --- a/src/vmm/src/persist.rs +++ b/src/vmm/src/persist.rs @@ -325,6 +325,27 @@ fn snapshot_memory_to_file( snapshot_type: &SnapshotType, ) -> std::result::Result<(), CreateSnapshotError> { use self::CreateSnapshotError::*; + if OpenOptions::new().read(true).open(mem_file_path).is_ok() + && snapshot_type == &SnapshotType::Diff + { + // // The memory file already exists. + // // We're going to use the msync behaviour + // for region in vmm.guest_memory().iter() { + // info!("msyncing memory region"); + // unsafe { + // if libc::msync(region.as_ptr() as _, region.len() as _, libc::MS_SYNC) == -1 { + // return Err(CreateSnapshotError::Memory( + // memory_snapshot::Error::CreateRegion(vm_memory::MmapRegionError::Mmap( + // std::io::Error::last_os_error(), + // )), + // )); + // } + // }; + // } + + return Ok(()); + } + let mut file = OpenOptions::new() .write(true) .create(true) @@ -483,6 +504,16 @@ pub fn snapshot_state_sanity_check( Ok(()) } +/// Describes a descriptor that connects to the memory used by the VM. This could either be the a file descriptor +/// or a UFFD descriptor. +#[derive(Debug)] +pub enum MemoryDescriptor { + /// A file descriptor that connects to the user fault process. + Uffd(Uffd), + /// A file descriptor of the backing memory file. + File(Arc), +} + /// Loads a Microvm snapshot producing a 'paused' Microvm. pub fn restore_from_snapshot( instance_info: &InstanceInfo, @@ -501,26 +532,31 @@ pub fn restore_from_snapshot( let mem_backend_path = ¶ms.mem_backend.backend_path; let mem_state = µvm_state.memory_state; let track_dirty_pages = params.enable_diff_snapshots; - let (guest_memory, uffd) = match params.mem_backend.backend_type { - MemBackendType::File => ( - guest_memory_from_file(mem_backend_path, mem_state, track_dirty_pages)?, - None, - ), - MemBackendType::Uffd => guest_memory_from_uffd( - mem_backend_path, - mem_state, - track_dirty_pages, - // We enable the UFFD_FEATURE_EVENT_REMOVE feature only if a balloon device - // is present in the microVM state. - microvm_state.device_states.balloon_device.is_some(), - )?, + let (guest_memory, memory_descriptor) = match params.mem_backend.backend_type { + MemBackendType::File => { + let (guest_memory, file) = + guest_memory_from_file(mem_backend_path, mem_state, track_dirty_pages)?; + (guest_memory, Some(MemoryDescriptor::File(Arc::new(file)))) + } + MemBackendType::Uffd => { + let (guest_memory, uffd) = guest_memory_from_uffd( + mem_backend_path, + mem_state, + track_dirty_pages, + // We enable the UFFD_FEATURE_EVENT_REMOVE feature only if a balloon device + // is present in the microVM state. + microvm_state.device_states.balloon_device.is_some(), + )?; + + (guest_memory, uffd.map(MemoryDescriptor::Uffd)) + } }; builder::build_microvm_from_snapshot( instance_info, event_manager, microvm_state, guest_memory, - uffd, + memory_descriptor, track_dirty_pages, seccomp_filters, vm_resources, @@ -545,11 +581,19 @@ fn guest_memory_from_file( mem_file_path: &Path, mem_state: &GuestMemoryState, track_dirty_pages: bool, -) -> std::result::Result { +) -> std::result::Result<(GuestMemoryMmap, File), LoadSnapshotError> { use self::LoadSnapshotError::{DeserializeMemory, MemoryBackingFile}; - let mem_file = File::open(mem_file_path).map_err(MemoryBackingFile)?; - GuestMemoryMmap::restore(Some(&mem_file), mem_state, track_dirty_pages) - .map_err(DeserializeMemory) + let mem_file = OpenOptions::new() + .write(true) + .read(true) + .open(mem_file_path) + .map_err(MemoryBackingFile)?; + + Ok(( + GuestMemoryMmap::restore(Some(&mem_file), mem_state, track_dirty_pages) + .map_err(DeserializeMemory)?, + mem_file, + )) } fn guest_memory_from_uffd( diff --git a/src/vmm/src/resources.rs b/src/vmm/src/resources.rs index 9a5257ac307..fb9c3b11e61 100644 --- a/src/vmm/src/resources.rs +++ b/src/vmm/src/resources.rs @@ -2,6 +2,7 @@ // SPDX-License-Identifier: Apache-2.0 use std::convert::From; +use std::fs::File; use std::sync::{Arc, Mutex, MutexGuard}; use logger::info; @@ -117,6 +118,8 @@ pub struct VmResources { pub mmds_size_limit: usize, /// Whether or not to load boot timer device. pub boot_timer: bool, + /// When backed by a memory file, this should be set + pub backing_memory_file: Option>, } impl VmResources { @@ -236,6 +239,16 @@ impl VmResources { self.vm_config.track_dirty_pages = dirty_page_tracking; } + /// Returns the config for the backing memory file + pub fn backing_memory_file(&self) -> Option> { + self.backing_memory_file.clone() + } + + /// Sets the backing memory file + pub fn set_backing_memory_file(&mut self, backing_mem_file: Arc) { + self.backing_memory_file.get_or_insert(backing_mem_file); + } + /// Returns the VmConfig. pub fn vm_config(&self) -> &VmConfig { &self.vm_config @@ -575,6 +588,7 @@ mod tests { mmds: None, boot_timer: false, mmds_size_limit: HTTP_MAX_PAYLOAD_SIZE, + backing_memory_file: None, } } diff --git a/src/vmm/src/rpc_interface.rs b/src/vmm/src/rpc_interface.rs index 2a79237d3f1..87f9f2aae2a 100644 --- a/src/vmm/src/rpc_interface.rs +++ b/src/vmm/src/rpc_interface.rs @@ -2,8 +2,8 @@ // SPDX-License-Identifier: Apache-2.0 use std::fmt::{Display, Formatter}; -use std::result; use std::sync::{Arc, Mutex, MutexGuard}; +use std::{fs, result}; use logger::*; use mmds::data_store::{self, Mmds}; @@ -34,6 +34,7 @@ use crate::vmm_config::drive::{BlockDeviceConfig, BlockDeviceUpdateConfig, Drive use crate::vmm_config::instance_info::InstanceInfo; use crate::vmm_config::logger::{LoggerConfig, LoggerConfigError}; use crate::vmm_config::machine_config::{VmConfig, VmConfigError, VmUpdateConfig}; +use crate::vmm_config::memory_backing_file::{MemoryBackingFileConfig, MemoryBackingFileError}; use crate::vmm_config::metrics::{MetricsConfig, MetricsConfigError}; use crate::vmm_config::mmds::{MmdsConfig, MmdsConfigError}; use crate::vmm_config::net::{ @@ -99,6 +100,9 @@ pub enum VmmAction { /// `BalloonDeviceConfig` as input. This action can only be called before the microVM /// has booted. SetBalloonDevice(BalloonDeviceConfig), + /// Set the memory backing file for the VM. The VM will use this backing file to store its + /// memory. This action can only be called before the microVM has booted. + SetMemoryBackingFile(MemoryBackingFileConfig), /// Set the MMDS configuration. SetMmdsConfiguration(MmdsConfig), /// Set the vsock device or update the one that already exists using the @@ -130,6 +134,8 @@ pub enum VmmAction { pub enum VmmActionError { /// The action `SetBalloonDevice` failed because of bad user input. BalloonConfig(BalloonConfigError), + /// The action `SetMemoryBackingFile` failed because of bad user input. + MemoryBackingFile(MemoryBackingFileError), /// The action `ConfigureBootSource` failed because of bad user input. BootSource(BootSourceConfigError), /// The action `CreateSnapshot` failed. @@ -182,6 +188,7 @@ impl Display for VmmActionError { match self { BalloonConfig(err) => err.to_string(), BootSource(err) => err.to_string(), + MemoryBackingFile(err) => err.to_string(), CreateSnapshot(err) => err.to_string(), DriveConfig(err) => err.to_string(), InternalVmm(err) => format!("Internal Vmm error: {}", err), @@ -422,6 +429,7 @@ impl<'a> PrebootApiController<'a> { SetBalloonDevice(config) => self.set_balloon_device(config), SetVsockDevice(config) => self.set_vsock_device(config), SetMmdsConfiguration(config) => self.set_mmds_config(config), + SetMemoryBackingFile(config) => self.set_backing_memory_file(config), StartMicroVm => self.start_microvm(), UpdateVmConfiguration(config) => self.update_vm_config(config), // Operations not allowed pre-boot. @@ -447,6 +455,23 @@ impl<'a> PrebootApiController<'a> { .map_err(VmmActionError::BalloonConfig) } + fn set_backing_memory_file(&mut self, cfg: MemoryBackingFileConfig) -> ActionResult { + self.boot_path = true; + let file = fs::OpenOptions::new() + .create(true) + .read(true) + .write(true) + .open(cfg.path) + .map(Arc::new) + .map_err(|e| { + VmmActionError::MemoryBackingFile(MemoryBackingFileError::CreateFile(e)) + })?; + + self.vm_resources.backing_memory_file = Some(file); + + Ok(VmmData::Empty) + } + fn insert_block_device(&mut self, cfg: BlockDeviceConfig) -> ActionResult { self.boot_path = true; self.vm_resources @@ -654,6 +679,7 @@ impl RuntimeApiController { | InsertNetworkDevice(_) | LoadSnapshot(_) | SetBalloonDevice(_) + | SetMemoryBackingFile(_) | SetVsockDevice(_) | SetMmdsConfiguration(_) | StartMicroVm @@ -720,14 +746,14 @@ impl RuntimeApiController { fn create_snapshot(&mut self, create_params: &CreateSnapshotParams) -> ActionResult { log_dev_preview_warning("Virtual machine snapshots", None); - if create_params.snapshot_type == SnapshotType::Diff - && !self.vm_resources.track_dirty_pages() - { - return Err(VmmActionError::NotSupported( - "Diff snapshots are not allowed on uVMs with dirty page tracking disabled." - .to_string(), - )); - } + // if create_params.snapshot_type == SnapshotType::Diff + // && !self.vm_resources.track_dirty_pages() + // { + // return Err(VmmActionError::NotSupported( + // "Diff snapshots are not allowed on uVMs with dirty page tracking disabled." + // .to_string(), + // )); + // } let mut locked_vmm = self.vmm.lock().unwrap(); let create_start_us = utils::time::get_time_us(utils::time::ClockType::Monotonic); @@ -862,6 +888,7 @@ mod tests { pub boot_timer: bool, // when `true`, all self methods are forced to fail pub force_errors: bool, + pub backing_memory_file: Option>, } impl MockVmRes { diff --git a/src/vmm/src/vmm_config/memory_backing_file.rs b/src/vmm/src/vmm_config/memory_backing_file.rs new file mode 100644 index 00000000000..f5f5bd9ce6f --- /dev/null +++ b/src/vmm/src/vmm_config/memory_backing_file.rs @@ -0,0 +1,31 @@ +use std::{ + fmt::{Display, Formatter}, + io, + path::PathBuf, +}; + +use serde::{Deserialize, Serialize}; + +/// Keeps the Memory Backing file configuration. +#[derive(Clone, Debug, Deserialize, PartialEq, Serialize)] +#[serde(deny_unknown_fields)] +pub struct MemoryBackingFileConfig { + /// Location of the memory backing file. + pub path: PathBuf, +} + +/// Errors associated with the operations allowed on a memory backing file. +#[derive(Debug)] +pub enum MemoryBackingFileError { + /// Failed to create the block device + CreateFile(io::Error), +} + +impl Display for MemoryBackingFileError { + fn fmt(&self, f: &mut Formatter) -> std::fmt::Result { + use self::MemoryBackingFileError::*; + match self { + CreateFile(e) => write!(f, "Unable to create the memory backing file: {}", e), + } + } +} diff --git a/src/vmm/src/vmm_config/mod.rs b/src/vmm/src/vmm_config/mod.rs index c509fce888f..61b235a0fbb 100644 --- a/src/vmm/src/vmm_config/mod.rs +++ b/src/vmm/src/vmm_config/mod.rs @@ -23,6 +23,8 @@ pub mod instance_info; pub mod logger; /// Wrapper for configuring the memory and CPU of the microVM. pub mod machine_config; +/// Wrapper for configuring the backing memory file. +pub mod memory_backing_file; /// Wrapper for configuring the metrics. pub mod metrics; /// Wrapper for configuring the MMDS. From 41643714f571f281ac9777ff72254591856ea0da Mon Sep 17 00:00:00 2001 From: Ives van Hoorne Date: Wed, 31 Aug 2022 12:05:46 +0200 Subject: [PATCH 3/3] fix: VMs getting stuck when starting from snapshot Fixes https://github.com/firecracker-microvm/firecracker/issues/3043 Fixes https://github.com/firecracker-microvm/firecracker/issues/3020 --- src/cpuid/src/transformer/amd.rs | 3 +++ src/cpuid/src/transformer/common.rs | 13 +++++++++++++ src/cpuid/src/transformer/intel.rs | 2 ++ 3 files changed, 18 insertions(+) diff --git a/src/cpuid/src/transformer/amd.rs b/src/cpuid/src/transformer/amd.rs index c1db38260a5..261e88ef4e4 100644 --- a/src/cpuid/src/transformer/amd.rs +++ b/src/cpuid/src/transformer/amd.rs @@ -147,6 +147,9 @@ impl CpuidTransformer for AmdCpuidTransformer { leaf_0x8000001d::LEAF_NUM => Some(amd::update_extended_cache_topology_entry), leaf_0x8000001e::LEAF_NUM => Some(amd::update_extended_apic_id_entry), 0x8000_0002..=0x8000_0004 => Some(common::update_brand_string_entry), + + // hypervisor stuff + 0x4000_0001 => Some(common::disable_kvm_feature_async_pf), _ => None, } } diff --git a/src/cpuid/src/transformer/common.rs b/src/cpuid/src/transformer/common.rs index ba89fc35f74..59de658fba5 100644 --- a/src/cpuid/src/transformer/common.rs +++ b/src/cpuid/src/transformer/common.rs @@ -69,6 +69,19 @@ pub fn update_brand_string_entry( Ok(()) } +// KVM feature bits +#[cfg(target_arch = "x86_64")] +const KVM_FEATURE_ASYNC_PF_INT_BIT: u32 = 14; + +pub fn disable_kvm_feature_async_pf( + entry: &mut kvm_cpuid_entry2, + vm_spec: &VmSpec, +) -> Result<(), Error> { + entry.eax.write_bit(KVM_FEATURE_ASYNC_PF_INT_BIT, false); + + Ok(()) +} + pub fn update_cache_parameters_entry( entry: &mut kvm_cpuid_entry2, vm_spec: &VmSpec, diff --git a/src/cpuid/src/transformer/intel.rs b/src/cpuid/src/transformer/intel.rs index 8505b668932..439058c8780 100644 --- a/src/cpuid/src/transformer/intel.rs +++ b/src/cpuid/src/transformer/intel.rs @@ -126,6 +126,8 @@ impl CpuidTransformer for IntelCpuidTransformer { leaf_0xa::LEAF_NUM => Some(intel::update_perf_mon_entry), leaf_0xb::LEAF_NUM => Some(intel::update_extended_topology_entry), 0x8000_0002..=0x8000_0004 => Some(common::update_brand_string_entry), + // hypervisor stuff + 0x4000_0001 => Some(common::disable_kvm_feature_async_pf), _ => None, } }