diff --git a/src/contrasts.jl b/src/contrasts.jl index c8461a3e..2f3de45b 100644 --- a/src/contrasts.jl +++ b/src/contrasts.jl @@ -227,8 +227,13 @@ function termnames(C::AbstractContrasts, levels::AbstractVector, baseind::Intege levels[not_base] end -Base.getindex(contrasts::ContrastsMatrix{C,T}, rowinds, colinds) where {C,T} = - getindex(contrasts.matrix, getindex.(Ref(contrasts.invindex), rowinds), colinds) +function Base.getindex(contrasts::ContrastsMatrix{C,T}, rowinds, colinds) where {C,T} + # allow rows to be missing + rows = get.(Ref(contrasts.invindex), rowinds, missing) + # create a row of nothing but missings for missing values + mrow = reduce(vcat, [missing for c in getindex(contrasts.matrix, 1, colinds)]) + vcat([r === missing ? mrow : getindex(contrasts.matrix, r, colinds) for r in rows]) +end # Making a contrast type T only requires that there be a method for # contrasts_matrix(T, baseind, n) and optionally termnames(T, levels, baseind) diff --git a/src/modelframe.jl b/src/modelframe.jl index c5759ebc..ec699983 100644 --- a/src/modelframe.jl +++ b/src/modelframe.jl @@ -71,11 +71,11 @@ missing_omit(data::T, formula::AbstractTerm) where T<:ColumnTable = function ModelFrame(f::FormulaTerm, data::ColumnTable; model::Type{M}=StatisticalModel, contrasts=Dict{Symbol,Any}()) where M - data, _ = missing_omit(data, f) - sch = schema(f, data, contrasts) f = apply_schema(f, sch, M) - + + data, _ = missing_omit(data, f) + ModelFrame(f, sch, data, model) end diff --git a/src/schema.jl b/src/schema.jl index cb0bd698..631dfcba 100644 --- a/src/schema.jl +++ b/src/schema.jl @@ -60,7 +60,7 @@ Base.haskey(schema::Schema, key) = haskey(schema.schema, key) Compute all the invariants necessary to fit a model with `terms`. A schema is a dict that maps `Term`s to their concrete instantiations (either `CategoricalTerm`s or `ContinuousTerm`s. "Hints" may optionally be supplied in the form of a `Dict` mapping term -names (as `Symbol`s) to term or contrast types. If a hint is not provided for a variable, +names (as `Symbol`s) to term or contrast types. If a hint is not provided for a variable, the appropriate term type will be guessed based on the data type from the data column: any numeric data is assumed to be continuous, and any non-numeric data is assumed to be categorical. @@ -91,7 +91,7 @@ StatsModels.Schema with 1 entry: y => y ``` -Note that concrete `ContinuousTerm` and `CategoricalTerm` and un-typed `Term`s print the +Note that concrete `ContinuousTerm` and `CategoricalTerm` and un-typed `Term`s print the same in a container, but when printed alone are different: ```jldoctest 1 @@ -181,6 +181,8 @@ concrete_term(t::Term, x, hint::AbstractTerm) = hint concrete_term(t, d, hint) = t concrete_term(t::Term, xs::AbstractVector{<:Number}, ::Nothing) = concrete_term(t, xs, ContinuousTerm) +# and for missing values +concrete_term(t::Term, xs::AbstractVector{Union{Missing,T}} where T<:Number, ::Nothing) = concrete_term(t, xs, ContinuousTerm) function concrete_term(t::Term, xs::AbstractVector, ::Type{ContinuousTerm}) μ, σ2 = StatsBase.mean_and_var(xs) min, max = extrema(xs) @@ -201,9 +203,9 @@ end Return a new term that is the result of applying `schema` to term `t` with destination model (type) `Mod`. If `Mod` is omitted, `Nothing` will be used. -When `t` is a `ContinuousTerm` or `CategoricalTerm` already, the term will be returned -unchanged _unless_ a matching term is found in the schema. This allows -selective re-setting of a schema to change the contrast coding or levels of a +When `t` is a `ContinuousTerm` or `CategoricalTerm` already, the term will be returned +unchanged _unless_ a matching term is found in the schema. This allows +selective re-setting of a schema to change the contrast coding or levels of a categorical term, or to change a continuous term to categorical or vice versa. When defining behavior for custom term types, it's best to dispatch on @@ -282,7 +284,7 @@ function apply_schema(t::FormulaTerm, schema::Schema, Mod::Type{<:StatisticalMod end # strategy is: apply schema, then "repair" if necessary (promote to full rank -# contrasts). +# contrasts). # # to know whether to repair, need to know context a term appears in. main # effects occur in "own" context. diff --git a/src/terms.jl b/src/terms.jl index d4ec525b..6bad5555 100644 --- a/src/terms.jl +++ b/src/terms.jl @@ -483,6 +483,13 @@ modelcols(t::Term, d::NamedTuple) = modelcols(ft::FunctionTerm{Fo,Fa,Names}, d::NamedTuple) where {Fo,Fa,Names} = ft.fanon.(getfield.(Ref(d), Names)...) +# this is weird, but using import Base: copy leads to exporting type piracy +# for non missing values, the compiler should hopefully optimize down the extra +# layer of indirection +function copy end +copy(x::Any) = Base.copy(x) +copy(m::Missing) = m + modelcols(t::ContinuousTerm, d::NamedTuple) = copy.(d[t.sym]) modelcols(t::CategoricalTerm, d::NamedTuple) = t.contrasts[d[t.sym], :] diff --git a/test/terms.jl b/test/terms.jl index 88dc000c..771ccf38 100644 --- a/test/terms.jl +++ b/test/terms.jl @@ -31,6 +31,17 @@ StatsModels.apply_schema(mt::MultiTerm, sch::StatsModels.Schema, Mod::Type) = @test t0.min == 1.0 @test t0.max == 3.0 + vals0m = [3, missing, 1] + t0m = concrete_term(t, vals0m) + @test string(t0m) == "aaa" + @test mimestring(t0m) == "aaa(continuous)" + # compute all these values to make sure the behavior of terms matches + # the behavior of other relevant packages + @test isequal(t0m.mean, mean(vals0m)) + @test isequal(t0m.var, var(vals0m)) + @test isequal(t0m.min, min(vals0m...)) + @test isequal(t0m.max, max(vals0m...)) + t1 = concrete_term(t, [:a, :b, :c]) @test t1.contrasts isa StatsModels.ContrastsMatrix{DummyCoding} @test string(t1) == "aaa"