sl-solution · dyeeee · Sep 12, 2022 · Sep 12, 2022
diff --git a/src/byrow/byrow.jl b/src/byrow/byrow.jl
@@ -6,6 +6,8 @@ nunique(::_DUMMY_STRUCT) =  false
 stdze!(::_DUMMY_STRUCT) = false
 stdze(::_DUMMY_STRUCT) = false
 select(::_DUMMY_STRUCT) = false
+rescale!(::_DUMMY_STRUCT) = false  # placeholder so `typeof(rescale!)` exists for byrow dispatch
+rescale(::_DUMMY_STRUCT) = false  # placeholder so `typeof(rescale)` exists for byrow dispatch
 
 byrow(ds::AbstractDataset, ::typeof(Base.sum), cols::MultiColumnIndex = names(ds, Union{Missing, Number}); by = identity, threads = nrow(ds) > Threads.nthreads()*10) = row_sum(ds, by, cols, threads = threads)
 byrow(ds::AbstractDataset, ::typeof(Base.sum), col::ColumnIndex; by = identity, threads = nrow(ds) > Threads.nthreads()*10) = byrow(ds, sum, [col]; by = by, threads = threads)
@@ -225,6 +227,10 @@ byrow(ds::AbstractDataset, ::typeof(stdze), cols::MultiColumnIndex = names(ds, U
 
 byrow(ds::AbstractDataset, ::typeof(stdze!), cols::MultiColumnIndex = names(ds, Union{Missing, Number}); threads = true) = row_stdze!(ds, cols, threads = threads)
 
+byrow(ds::AbstractDataset, ::typeof(rescale), cols::MultiColumnIndex = names(ds, Union{Missing, Number}); range = [0, 1], threads = true) = row_rescale(ds, cols; range = range, threads = threads) # forwards to the copying variant
+
+byrow(ds::AbstractDataset, ::typeof(rescale!), cols::MultiColumnIndex = names(ds, Union{Missing, Number}); range = [0, 1], threads = true) = row_rescale!(ds, cols; range = range, threads = threads) # forwards to the in-place variant
+
 function byrow(ds::AbstractDataset, ::typeof(hash), cols::MultiColumnIndex = :; by = identity, mapformats = false, threads = nrow(ds) > Threads.nthreads()*10)
 	colsidx = multiple_getindex(index(ds), cols)
 	if mapformats

diff --git a/src/byrow/doc.jl b/src/byrow/doc.jl
@@ -78,6 +78,10 @@ function Docs.getdoc(x::typeof(byrow), y)
         return _get_doc_byrow("stdze!")
     elseif y == Tuple{typeof(stdze)}
         return _get_doc_byrow("stdze")
+    elseif y == Tuple{typeof(rescale!)}
+        return _get_doc_byrow("rescale!")
+    elseif y == Tuple{typeof(rescale)}
+        return _get_doc_byrow("rescale")
     else
         return _get_doc_byrow("generic")
     end
@@ -145,6 +149,9 @@ Perform a row-wise operation specified by `fun` on selected columns `cols`. Gene
 - `sort!`
 - `stdze`
 - `stdze!`
+- `rescale`
+- `rescale!`
+
 @@@@sum@@@@
     byrow(ds::AbstractDataset, sum, cols = names(ds, Number); [by = identity, threads])
 
@@ -1287,6 +1294,28 @@ julia> byrow(ds,stdze!,:)
     byrow(ds::AbstractDataset, stdze, cols; [threads])
 
 Variant of `byrow(stdze!)` which pass a copy of `ds` and leave `ds` untouched.
+
+@@@@rescale!@@@@
+    byrow(ds::Dataset, rescale!, cols; [range = [0, 1], threads])
+
+Replace each value in each row of `ds` for selected `cols` by its rescaled value.
+Also known as min-max scaling or min-max normalization, rescaling is the simplest scaling method: it linearly maps the values in each row onto a target range.
+The formula to rescale `x` to an arbitrary range [a, b] is `a + (x - min(x)) * (b - a) / (max(x) - min(x))`, where `min(x)` and `max(x)` are the minimum and maximum of the row.
+
+Missing values are skipped from the calculation. When all values in a row are missing, it returns `missing`.
+If the maximum value of a row is equal to the minimum value of a row, the result will also be `missing`.
+
+
+Passing `range = [minval, maxval]` sets the range of the rescaled result.
+Passing `threads = false` disables multithreaded computations.
+
+See [`byrow(rescale)`](@ref)
+
+@@@@rescale@@@@
+    byrow(ds::AbstractDataset, rescale, cols; [range = [0, 1], threads])
+
+Variant of `byrow(rescale!)` which passes a copy of `ds` and leaves `ds` untouched.
+
 @@@@generic@@@@
     byrow(ds::AbstractDataset, fun, cols; [threads])
 

diff --git a/src/byrow/row_functions.jl b/src/byrow/row_functions.jl
@@ -1002,6 +1002,31 @@ function row_stdze(ds::AbstractDataset , cols = names(ds, Union{Missing, Number}
     dscopy
 end
 
+function row_rescale!(ds::Dataset, cols=names(ds, Union{Missing,Number}); range, threads=true)
+    colsidx = index(ds)[cols]
+    # Row-wise extrema over the selected columns; `threads` is only forwarded to these two reductions.
+    mindata = row_minimum(ds, colsidx; threads=threads)
+    maxdata = row_maximum(ds, colsidx; threads=threads)
+    max_min = maxdata .- mindata
+    # Min-max scale one column into [range[1], range[2]]; rows whose max equals min yield `missing`.
+    _rescale_fun(x) = ifelse.(isequal.(max_min, 0), missing, range[1] .+ (((x .- mindata) .* (range[2] - range[1])) ./ max_min))
+    # Overwrite every selected column of `ds` with its rescaled values (in-place on the dataset).
+    for j in colsidx
+        _columns(ds)[j] = _rescale_fun(_columns(ds)[j])
+    end
+    removeformat!(ds, colsidx)
+    any(index(ds).sortedcols .∈ Ref(colsidx)) && _reset_grouping_info!(ds)  # only when a sorted column was rescaled
+    _modified(_attributes(ds))  # presumably records the modification on the dataset attributes
+    ds
+end
+
+function row_rescale(ds::AbstractDataset, cols=names(ds, Union{Missing,Number}); range, threads=true)
+    scaled = copy(ds)  # operate on a copy so the caller's `ds` is not modified
+    row_rescale!(scaled, cols; range=range, threads=threads)
+    scaled
+end
+
+
 function row_sort!(ds::Dataset, cols = names(ds, Union{Missing, Number}); kwargs...)
     colsidx = index(ds)[cols]
     T = mapreduce(eltype, promote_type, eachcol(ds)[colsidx])