-
Notifications
You must be signed in to change notification settings - Fork 16
Optimise Repr::clone and clone_from for the inline case #69
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Open
DRMacIver
wants to merge
5
commits into
cmpute:master
Choose a base branch
from
DRMacIver:opt/p6-clone-fast-path
base: master
Could not load branches
Branch not found: {{ refName }}
Loading
Could not load tags
Nothing to show
Loading
Are you sure you want to change the base?
Some commits from the old base branch may be removed from the timeline,
and old review comments may become outdated.
Open
Changes from all commits
Commits
Show all changes
5 commits
Select commit
Hold shift + click to select a range
21fdad1
Optimise Repr::clone for the inline case
DRMacIver 6951b5b
Inline Repr::clone_from and tighten its fast path
DRMacIver 314b426
Optimise heap clone_heap path; drop the cold annotation
DRMacIver 402c867
Add clone regression test; satisfy upstream rustfmt and clippy
DRMacIver 59f6344
Trim verbose and profiling-context comments in clone path
DRMacIver File filter
Filter by extension
Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
There are no files selected for viewing
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -443,6 +443,56 @@ impl Repr { | |
| self | ||
| } | ||
|
|
||
| /// Slow path for `Clone::clone`: heap-resident values only (|capacity| > 2). | ||
| /// | ||
| /// Allocates directly instead of going through `Buffer::allocate` + | ||
| /// `push_slice` + `mem::transmute`, skipping the redundant capacity/length | ||
| /// bound checks and the sign-flip dance. Kept `#[inline(never)]` so the | ||
| /// inline fast path in `clone` stays small. | ||
| #[inline(never)] | ||
| fn clone_heap(&self) -> Self { | ||
| debug_assert!(self.capacity.get().unsigned_abs() > 2); | ||
| // SAFETY: abs(capacity) > 2 ⇒ the `heap` variant of the union | ||
| // is the live one, and `len >= 3`. | ||
| unsafe { | ||
| let (src_ptr, len) = self.data.heap; | ||
| debug_assert!(len >= 3); | ||
|
|
||
| // Mirrors `Buffer::default_capacity(len)`. `len <= | ||
| // Buffer::MAX_CAPACITY` is invariant for any heap Repr, and | ||
| // `default_capacity` is monotone, so `new_cap` is bounded | ||
| // and the bound check inside `Buffer::allocate_raw` would | ||
| // never fire. | ||
| let new_cap = len + len / 8 + 2; | ||
| debug_assert!((3..=Buffer::MAX_CAPACITY).contains(&new_cap)); | ||
|
|
||
| let layout = core::alloc::Layout::from_size_align_unchecked( | ||
| new_cap * mem::size_of::<Word>(), | ||
| mem::align_of::<Word>(), | ||
| ); | ||
| let new_ptr = alloc::alloc::alloc(layout) as *mut Word; | ||
| if new_ptr.is_null() { | ||
| crate::error::panic_out_of_memory(); | ||
| } | ||
|
|
||
| ptr::copy_nonoverlapping(src_ptr, new_ptr, len); | ||
|
|
||
| // Set the result's signed capacity in one shot so we never | ||
| // build a positive Repr only to negate it back. | ||
| let signed_cap = if self.capacity.get() < 0 { | ||
| -(new_cap as isize) | ||
| } else { | ||
| new_cap as isize | ||
| }; | ||
| Repr { | ||
| data: ReprData { | ||
| heap: (new_ptr, len), | ||
| }, | ||
| capacity: NonZeroIsize::new_unchecked(signed_cap), | ||
| } | ||
| } | ||
| } | ||
|
|
||
| /// Returns a number representing sign of self. | ||
| /// | ||
| /// * [Self::zero] if the number is zero | ||
|
|
@@ -461,43 +511,34 @@ impl Repr { | |
|
|
||
| // Cloning for Repr is written in a verbose way because it's performance critical. | ||
| impl Clone for Repr { | ||
| #[inline] | ||
| fn clone(&self) -> Self { | ||
| let (capacity, sign) = self.sign_capacity(); | ||
|
|
||
| // SAFETY: see the comments inside the block | ||
| let new = unsafe { | ||
| // inline the data if the length is less than 3 | ||
| // SAFETY: we check the capacity before accessing the variants | ||
| if capacity <= 2 { | ||
| Repr { | ||
| data: ReprData { | ||
| inline: self.data.inline, | ||
| }, | ||
| // SAFETY: the capacity is from self, which guarantees it to be zero | ||
| capacity: NonZeroIsize::new_unchecked(capacity as isize), | ||
| } | ||
| } else { | ||
| let (ptr, len) = self.data.heap; | ||
| // SAFETY: len is at least 2 when it's heap allocated (invariant of Repr) | ||
| let mut new_buffer = Buffer::allocate(len); | ||
| new_buffer.push_slice(slice::from_raw_parts(ptr, len)); | ||
|
|
||
| // SAFETY: abs(self.capacity) >= 3 => self.data.len >= 3 | ||
| // so the capacity and len of new_buffer will be both >= 3 | ||
| mem::transmute::<Buffer, Repr>(new_buffer) | ||
| } | ||
| }; | ||
| new.with_sign(sign) | ||
| // Inline case (|capacity| <= 2): copy the union and signed capacity | ||
| // verbatim, avoiding the `sign_capacity` + `with_sign` round-trip the | ||
| // old implementation did. The heap path is split into `clone_heap`. | ||
| if self.capacity.get().unsigned_abs() <= 2 { | ||
| return Repr { | ||
| data: ReprData { | ||
| // SAFETY: capacity in {-2,-1,1,2} ⇒ the `inline` | ||
| // variant of the union is the live one. | ||
| inline: unsafe { self.data.inline }, | ||
| }, | ||
| capacity: self.capacity, | ||
| }; | ||
| } | ||
| self.clone_heap() | ||
| } | ||
|
|
||
| #[inline] | ||
| fn clone_from(&mut self, src: &Self) { | ||
| let (src_cap, src_sign) = src.sign_capacity(); | ||
| let (cap, _) = self.sign_capacity(); | ||
| let src_cap_raw = src.capacity.get(); | ||
|
Owner
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I don't think there is a difference before and after your change on this function. The only change that possibly makes a difference is adding the |
||
| let cap = self.capacity.get().unsigned_abs(); | ||
|
|
||
| // SAFETY: see the comments inside the block | ||
| unsafe { | ||
| // shortcut for inlined data | ||
| if src_cap <= 2 { | ||
| // Fast path: src is inline. Copy union + signed capacity, with a | ||
| // rare dealloc when self was previously heap-resident. | ||
| if src_cap_raw.unsigned_abs() <= 2 { | ||
| if cap > 2 { | ||
| // release the old buffer if necessary | ||
| // SAFETY: self.data.heap.0 must be valid pointer if cap > 2 | ||
|
|
@@ -507,6 +548,11 @@ impl Clone for Repr { | |
| self.capacity = src.capacity; | ||
| return; | ||
| } | ||
| let src_sign = if src_cap_raw > 0 { | ||
| Sign::Positive | ||
| } else { | ||
| Sign::Negative | ||
| }; | ||
|
|
||
| // SAFETY: we checked that abs(src.capacity) > 2 | ||
| let (src_ptr, src_len) = src.data.heap; | ||
|
|
||
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,53 @@ | ||
| mod helper_macros; | ||
|
|
||
| #[test] | ||
| fn test_clone_inline_and_heap() { | ||
| // value (and sign) preserved across clone, for inline and heap reprs | ||
| let ubigs = [ | ||
| ubig!(0), | ||
| ubig!(5), | ||
| ubig!(0x10000000000000000), // 2^64, two-word inline | ||
| ubig!(1) << 200, // heap | ||
| (ubig!(1) << 256) - ubig!(1), // heap, all ones | ||
| ]; | ||
| for v in &ubigs { | ||
| assert_eq!(&v.clone(), v); | ||
| } | ||
| let ibigs = [ | ||
| ibig!(0), | ||
| ibig!(5), | ||
| ibig!(-5), | ||
| ibig!(1) << 200, | ||
| -(ibig!(1) << 200), | ||
| ]; | ||
| for v in &ibigs { | ||
| assert_eq!(&v.clone(), v); | ||
| } | ||
| } | ||
|
|
||
| #[test] | ||
| fn test_clone_from_transitions() { | ||
| // clone_from across every inline/heap combination of (dst, src) — including | ||
| // dst heap-allocated with an inline src (dst's buffer must be freed) and | ||
| // inline dst with a heap src (must allocate). | ||
| let vals = [ | ||
| ubig!(0), | ||
| ubig!(7), | ||
| ubig!(0x10000000000000000), // inline cap 2 | ||
| ubig!(1) << 130, // heap | ||
| ubig!(1) << 200, // heap, larger | ||
| ]; | ||
| for src in &vals { | ||
| for dst0 in &vals { | ||
| let mut dst = dst0.clone(); | ||
| dst.clone_from(src); | ||
| assert_eq!(&dst, src, "clone_from {dst0} <- {src}"); | ||
| } | ||
| } | ||
| // signed: heap negative -> inline negative -> heap positive | ||
| let mut x = -(ibig!(1) << 200); | ||
| x.clone_from(&ibig!(-3)); | ||
| assert_eq!(x, ibig!(-3)); | ||
| x.clone_from(&(ibig!(1) << 200)); | ||
| assert_eq!(x, ibig!(1) << 200); | ||
| } |
Oops, something went wrong.
Add this suggestion to a batch that can be applied as a single commit.
This suggestion is invalid because no changes were made to the code.
Suggestions cannot be applied while the pull request is closed.
Suggestions cannot be applied while viewing a subset of changes.
Only one suggestion per line can be applied in a batch.
Add this suggestion to a batch that can be applied as a single commit.
Applying suggestions on deleted lines is not supported.
You must change the existing code in this line in order to create a valid suggestion.
Outdated suggestions cannot be applied.
This suggestion has been applied or marked resolved.
Suggestions cannot be applied from pending reviews.
Suggestions cannot be applied on multi-line comments.
Suggestions cannot be applied while the pull request is queued to merge.
Suggestion cannot be applied right now. Please check back later.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
This function indeed combines buffer allocate + push_slice + transmute into a single function. But there is a
Buffer::allocate_exactfunction existing, I insist to use Buffer's function for allocation, in case in future we plan to improve the allocators used for Buffer. Don't directly call alloc here.For skipping the sign check, I think it's fair to flatten it out, but it can still be achieved by create the Buffer first, and then access and flip the
capacityfield if necessary.