| Title: | Transfer Learning for Generalized Factor Models |
|---|---|
| Description: | Transfer learning for generalized factor models with support for continuous, count (Poisson), and binary data types. The package provides functions for single and multiple source transfer learning, source detection to identify positive and negative transfer sources, factor decomposition using Maximum Likelihood Estimation (MLE), and information criteria ('IC1' and 'IC2') for rank selection. The methods are particularly useful for high-dimensional data analysis where auxiliary information from related source datasets can improve estimation efficiency in the target domain. |
| Authors: | Zhijing Wang [aut, cre], Peirong Xu [aut], Hongyu Zhao [aut], Tao Wang [aut] |
| Maintainer: | Zhijing Wang <[email protected]> |
| License: | GPL-3 |
| Version: | 1.0.2 |
| Built: | 2026-06-08 08:10:32 UTC |
| Source: | https://github.com/zjwangatsu/transgfm |
Information criterion (IC1/IC2) for selecting number of factors
ic_criterion( X, r_max = 10, ic_type = c("IC1", "IC2"), data_type = "count", C = NULL, max_iter = 30, verbose = FALSE )ic_criterion( X, r_max = 10, ic_type = c("IC1", "IC2"), data_type = "count", C = NULL, max_iter = 30, verbose = FALSE )
X |
Data matrix (may contain missing values coded as NA) |
r_max |
Maximum number of factors to consider (default: 10) |
ic_type |
IC criterion type: "IC1" or "IC2" (default: "IC1") |
data_type |
Type of data: "continuous", "count", or "binary" |
C |
CJMLE projection constant (if NULL, auto-calculated) |
max_iter |
Maximum CJMLE iterations (default: 30) |
verbose |
Print progress information (default: FALSE) |
List with r_hat (optimal rank), ic_values, loglik_values
# Generate Poisson data with known rank set.seed(2025) n <- 100; p <- 100; r_true <- 2 # Generate true factors F_true <- matrix(runif(n * r_true, min = -2, max = 2), n, r_true) B_true <- matrix(runif(p * r_true, min = -2, max = 2), p, r_true) M_true <- F_true %*% t(B_true) # Generate Poisson observations lambda <- exp(M_true) X <- matrix(rpois(n * p, as.vector(lambda)), n, p) # Add 10% missing values n_missing <- floor(n * p * 0.1) missing_idx <- sample(n * p, n_missing) X[missing_idx] <- NA # Use IC1 to select rank result_IC1 <- ic_criterion( X = X, r_max = 6, ic_type = "IC1", data_type = "count", verbose = TRUE ) print(paste("True rank:", r_true)) print(paste("Estimated rank (IC1):", result_IC1$r_hat)) # Use IC2 to select rank result_IC2 <- ic_criterion( X = X, r_max = 6, ic_type = "IC2", data_type = "count", verbose = TRUE )# Generate Poisson data with known rank set.seed(2025) n <- 100; p <- 100; r_true <- 2 # Generate true factors F_true <- matrix(runif(n * r_true, min = -2, max = 2), n, r_true) B_true <- matrix(runif(p * r_true, min = -2, max = 2), p, r_true) M_true <- F_true %*% t(B_true) # Generate Poisson observations lambda <- exp(M_true) X <- matrix(rpois(n * p, as.vector(lambda)), n, p) # Add 10% missing values n_missing <- floor(n * p * 0.1) missing_idx <- sample(n * p, n_missing) X[missing_idx] <- NA # Use IC1 to select rank result_IC1 <- ic_criterion( X = X, r_max = 6, ic_type = "IC1", data_type = "count", verbose = TRUE ) print(paste("True rank:", r_true)) print(paste("Estimated rank (IC1):", result_IC1$r_hat)) # Use IC2 to select rank result_IC2 <- ic_criterion( X = X, r_max = 6, ic_type = "IC2", data_type = "count", verbose = TRUE )
Identify factor decomposition via SVD
identify(M, r)identify(M, r)
M |
Matrix to decompose |
r |
Number of factors |
List with F (row factors) and B (column factors)
# Generate Poisson data set.seed(123) n0 <- 50; p0 <- 50; r <- 2 F_true <- matrix(runif(n0 * r, min = -2, max = 2), n0, r) B_true <- matrix(runif(p0 * r, min = -2, max = 2), p0, r) F_true <- F_true / sqrt(r) B_true <- B_true / sqrt(r) M_true <- F_true %*% t(B_true) # Decompose using identify result <- identify(M_true, r = 2) F_hat <- result$F B_hat <- result$B # Check reconstruction M_reconstructed <- F_hat %*% t(B_hat) print(max(abs(M_reconstructed - M_true))) # Should be very small# Generate Poisson data set.seed(123) n0 <- 50; p0 <- 50; r <- 2 F_true <- matrix(runif(n0 * r, min = -2, max = 2), n0, r) B_true <- matrix(runif(p0 * r, min = -2, max = 2), p0, r) F_true <- F_true / sqrt(r) B_true <- B_true / sqrt(r) M_true <- F_true %*% t(B_true) # Decompose using identify result <- identify(M_true, r = 2) F_hat <- result$F B_hat <- result$B # Check reconstruction M_reconstructed <- F_hat %*% t(B_hat) print(max(abs(M_reconstructed - M_true))) # Should be very small
Calculate relative error between estimated and true matrices
relative_error(M_hat, M_true)relative_error(M_hat, M_true)
M_hat |
Estimated matrix |
M_true |
True matrix |
Relative Frobenius norm error
M_true <- matrix(1:9, 3, 3) M_hat <- M_true + matrix(rnorm(9, 0, 0.1), 3, 3) relative_error(M_hat, M_true)M_true <- matrix(1:9, 3, 3) M_hat <- M_true + matrix(rnorm(9, 0, 0.1), 3, 3) relative_error(M_hat, M_true)
Detect positive and negative transfer sources using ratio method
source_detection( X_sources, X0, r, C, C2, data_type = "count", c_penalty = 0.1, verbose = TRUE )source_detection( X_sources, X0, r, C, C2, data_type = "count", c_penalty = 0.1, verbose = TRUE )
X_sources |
List of source data matrices (may contain missing values) |
X0 |
Target data matrix (complete) |
r |
Number of factors |
C |
CJMLE projection constant |
C2 |
Refinement projection constant |
data_type |
Type of data: "continuous", "count", or "binary" |
c_penalty |
Penalty coefficient (default: 0.1) |
verbose |
Print progress information (default: TRUE) |
List with positive_sources, negative_sources, and diagnostic info
Identify potential sources based on rank comparison using IC criterion
source_potential( X_sources, X0, r_max = 10, ic_type = "IC1", data_type = "count", C = NULL, max_iter = 30, verbose = TRUE )source_potential( X_sources, X0, r_max = 10, ic_type = "IC1", data_type = "count", C = NULL, max_iter = 30, verbose = TRUE )
X_sources |
List of source data matrices (may contain missing values) |
X0 |
Target data matrix (may contain missing values) |
r_max |
Maximum number of factors to consider (default: 10) |
ic_type |
IC criterion type: "IC1" or "IC2" (default: "IC1") |
data_type |
Type of data: "continuous", "count", or "binary" |
C |
CJMLE projection constant (if NULL, auto-calculated) |
max_iter |
Maximum CJMLE iterations (default: 30) |
verbose |
Print progress information (default: TRUE) |
List with positive_potential_sources, negative_sources, r_target, r_sources
# Generate Poisson data set.seed(2025) # Generate 5 sources with different ranks n1 <- 100; p1 <- 100 source_list <- list() # Sources 1-2: rank 2 (same as target) r_s <- 2 F_s <- matrix(runif(n1 * r_s, min = -2, max = 2), n1, r_s) B_s <- matrix(runif(p1 * r_s, min = -2, max = 2), p1, r_s) M_s <- F_s %*% t(B_s) for (s in 1:2) { X_s <- matrix(rpois(n1 * p1, exp(M_s)), n1, p1) # Add 10% missing values n_missing <- floor(n1 * p1 * 0.1) missing_idx <- sample(n1 * p1, n_missing) X_s[missing_idx] <- NA source_list[[s]] <- X_s } # Sources 3-5: rank 3 (different from target) for (s in 3:5) { r_s_nega <- 3 F_s_nega <- matrix(runif(n1 * r_s_nega, min = -2, max = 2), n1, r_s_nega) B_s_nega <- matrix(runif(p1 * r_s_nega, min = -2, max = 2), p1, r_s_nega) M_s_nega <- F_s_nega %*% t(B_s_nega) X_s_nega <- matrix(rpois(n1 * p1, exp(M_s_nega)), n1, p1) n_missing <- floor(n1 * p1 * 0.1) missing_idx <- sample(n1 * p1, n_missing) X_s_nega[missing_idx] <- NA source_list[[s]] <- X_s_nega } # Target data: rank 2 n0 <- 50; p0 <- 50; r_target <- 2 M_target <- M_s[1:n0, 1:p0] X_target <- matrix(rpois(n0 * p0, exp(M_target)), n0, p0) # Identify potential sources result <- source_potential( X_sources = source_list, X0 = X_target, r_max = 5, ic_type = "IC1", data_type = "count", verbose = TRUE ) print(result$positive_potential_sources) # Should be c(1, 2) print(result$negative_sources) # Should be c(3, 4, 5) print(result$r_target) # Should be 2 print(result$r_sources) # Should be c(2, 2, 3, 3, 3)# Generate Poisson data set.seed(2025) # Generate 5 sources with different ranks n1 <- 100; p1 <- 100 source_list <- list() # Sources 1-2: rank 2 (same as target) r_s <- 2 F_s <- matrix(runif(n1 * r_s, min = -2, max = 2), n1, r_s) B_s <- matrix(runif(p1 * r_s, min = -2, max = 2), p1, r_s) M_s <- F_s %*% t(B_s) for (s in 1:2) { X_s <- matrix(rpois(n1 * p1, exp(M_s)), n1, p1) # Add 10% missing values n_missing <- floor(n1 * p1 * 0.1) missing_idx <- sample(n1 * p1, n_missing) X_s[missing_idx] <- NA source_list[[s]] <- X_s } # Sources 3-5: rank 3 (different from target) for (s in 3:5) { r_s_nega <- 3 F_s_nega <- matrix(runif(n1 * r_s_nega, min = -2, max = 2), n1, r_s_nega) B_s_nega <- matrix(runif(p1 * r_s_nega, min = -2, max = 2), p1, r_s_nega) M_s_nega <- F_s_nega %*% t(B_s_nega) X_s_nega <- matrix(rpois(n1 * p1, exp(M_s_nega)), n1, p1) n_missing <- floor(n1 * p1 * 0.1) missing_idx <- sample(n1 * p1, n_missing) X_s_nega[missing_idx] <- NA source_list[[s]] <- X_s_nega } # Target data: rank 2 n0 <- 50; p0 <- 50; r_target <- 2 M_target <- M_s[1:n0, 1:p0] X_target <- matrix(rpois(n0 * p0, exp(M_target)), n0, p0) # Identify potential sources result <- source_potential( X_sources = source_list, X0 = X_target, r_max = 5, ic_type = "IC1", data_type = "count", verbose = TRUE ) print(result$positive_potential_sources) # Should be c(1, 2) print(result$negative_sources) # Should be c(3, 4, 5) print(result$r_target) # Should be 2 print(result$r_sources) # Should be c(2, 2, 3, 3, 3)
Single source transfer learning for generalized factor models
transGFM( source_data, target_data, r, data_type = "count", lambda_seq = seq(0, 10, by = 1), K_cv = 3, sigma2 = 1, max_iter_cjmle = 30, max_iter_refine = 30, max_iter_nuclear = 30, verbose = FALSE )transGFM( source_data, target_data, r, data_type = "count", lambda_seq = seq(0, 10, by = 1), K_cv = 3, sigma2 = 1, max_iter_cjmle = 30, max_iter_refine = 30, max_iter_nuclear = 30, verbose = FALSE )
source_data |
Source data matrix (may contain missing values coded as NA) |
target_data |
Target data matrix (complete) |
r |
Number of factors |
data_type |
Type of data: "continuous", "count", or "binary" |
lambda_seq |
Sequence of lambda values for CV (default: seq(0, 10, by = 1)) |
K_cv |
Number of CV folds (default: 3) |
sigma2 |
Variance parameter for continuous data (default: 1) |
max_iter_cjmle |
Maximum iterations for CJMLE (default: 30) |
max_iter_refine |
Maximum iterations for refinement (default: 30) |
max_iter_nuclear |
Maximum iterations for nuclear MLE (default: 100) |
verbose |
Print progress information (default: FALSE) |
List containing final estimate M_trans and intermediate results
# Generate Poisson data set.seed(2025) # Source data (100 x 100 with 10% missing) n1 <- 100; p1 <- 100; r <- 2 F_source <- matrix(runif(n1 * r, min = -2, max = 2), n1, r) B_source <- matrix(runif(p1 * r, min = -2, max = 2), p1, r) M_source <- F_source %*% t(B_source) lambda_source <- exp(M_source) X_source <- matrix(rpois(n1 * p1, as.vector(lambda_source)), n1, p1) # Add 10% missing values to source n_missing <- floor(n1 * p1 * 0.1) missing_idx <- sample(n1 * p1, n_missing) X_source[missing_idx] <- NA # Target data (50 x 50, complete) n0 <- 50; p0 <- 50 M_target_true <- M_source[1:n0, 1:p0] lambda_target <- exp(M_target_true) X_target <- matrix(rpois(n0 * p0, as.vector(lambda_target)), n0, p0) # Run transGFM result <- transGFM( source_data = X_source, target_data = X_target, r = 2, data_type = "count", lambda_seq = seq(0, 5, by = 1), K_cv = 3, verbose = FALSE ) # Check results print(paste("Optimal lambda:", result$optimal_lambda)) print(paste("Final relative error:", relative_error(result$M_trans, M_target_true)))# Generate Poisson data set.seed(2025) # Source data (100 x 100 with 10% missing) n1 <- 100; p1 <- 100; r <- 2 F_source <- matrix(runif(n1 * r, min = -2, max = 2), n1, r) B_source <- matrix(runif(p1 * r, min = -2, max = 2), p1, r) M_source <- F_source %*% t(B_source) lambda_source <- exp(M_source) X_source <- matrix(rpois(n1 * p1, as.vector(lambda_source)), n1, p1) # Add 10% missing values to source n_missing <- floor(n1 * p1 * 0.1) missing_idx <- sample(n1 * p1, n_missing) X_source[missing_idx] <- NA # Target data (50 x 50, complete) n0 <- 50; p0 <- 50 M_target_true <- M_source[1:n0, 1:p0] lambda_target <- exp(M_target_true) X_target <- matrix(rpois(n0 * p0, as.vector(lambda_target)), n0, p0) # Run transGFM result <- transGFM( source_data = X_source, target_data = X_target, r = 2, data_type = "count", lambda_seq = seq(0, 5, by = 1), K_cv = 3, verbose = FALSE ) # Check results print(paste("Optimal lambda:", result$optimal_lambda)) print(paste("Final relative error:", relative_error(result$M_trans, M_target_true)))
Multiple source transfer learning for generalized factor models
transGFM_multi( source_data_list, target_data, r, data_type = "count", method = "AD", lambda_seq = seq(0, 10, by = 1), K_cv = 3, sigma2 = 1, max_iter_cjmle = 30, max_iter_refine = 30, max_iter_nuclear = 100, verbose = FALSE )transGFM_multi( source_data_list, target_data, r, data_type = "count", method = "AD", lambda_seq = seq(0, 10, by = 1), K_cv = 3, sigma2 = 1, max_iter_cjmle = 30, max_iter_refine = 30, max_iter_nuclear = 100, verbose = FALSE )
source_data_list |
List of source data matrices (may contain missing values) |
target_data |
Target data matrix (complete) |
r |
Number of factors |
data_type |
Type of data: "continuous", "count", or "binary" |
method |
Fusion method: "AD" (Average-Debias) or "DA" (Debias-Average) |
lambda_seq |
Sequence of lambda values for CV |
K_cv |
Number of CV folds |
sigma2 |
Variance parameter for continuous data |
max_iter_cjmle |
Maximum iterations for CJMLE |
max_iter_refine |
Maximum iterations for refinement |
max_iter_nuclear |
Maximum iterations for nuclear MLE |
verbose |
Print progress information |
List containing final estimate and intermediate results
# Generate Poisson data set.seed(2025) # Generate 3 source datasets (100 x 100 with different missing rates) n1 <- 100; p1 <- 100; r <- 2 source_list <- list() F_s <- matrix(runif(n1 * r, min = -2, max = 2), n1, r) B_s <- matrix(runif(p1 * r, min = -2, max = 2), p1, r) M_s <- F_s %*% t(B_s) for (s in 1:3) { X_s <- matrix(rpois(n1 * p1, exp(M_s)), n1, p1) # Add missing values (10%, 12%, 14% for sources 1-3) missing_rate <- 0.1 + (s - 1) * 0.02 n_missing <- floor(n1 * p1 * missing_rate) missing_idx <- sample(n1 * p1, n_missing) X_s[missing_idx] <- NA source_list[[s]] <- X_s } # Target data (50 x 50, complete) n0 <- 50; p0 <- 50 M_target_true <- M_s[1:n0, 1:p0] X_target <- matrix(rpois(n0 * p0, exp(M_target_true)), n0, p0) # Run transGFM_multi with AD method result_AD <- transGFM_multi( source_data_list = source_list, target_data = X_target, r = 2, data_type = "count", method = "AD", lambda_seq = seq(0, 5, by = 1), K_cv = 3, verbose = FALSE ) # Run transGFM_multi with DA method result_DA <- transGFM_multi( source_data_list = source_list, target_data = X_target, r = 2, data_type = "count", method = "DA", verbose = FALSE ) # Compare results print(paste("AD method error:", relative_error(result_AD$M_trans, M_target_true))) print(paste("DA method error:", relative_error(result_DA$M_trans, M_target_true)))# Generate Poisson data set.seed(2025) # Generate 3 source datasets (100 x 100 with different missing rates) n1 <- 100; p1 <- 100; r <- 2 source_list <- list() F_s <- matrix(runif(n1 * r, min = -2, max = 2), n1, r) B_s <- matrix(runif(p1 * r, min = -2, max = 2), p1, r) M_s <- F_s %*% t(B_s) for (s in 1:3) { X_s <- matrix(rpois(n1 * p1, exp(M_s)), n1, p1) # Add missing values (10%, 12%, 14% for sources 1-3) missing_rate <- 0.1 + (s - 1) * 0.02 n_missing <- floor(n1 * p1 * missing_rate) missing_idx <- sample(n1 * p1, n_missing) X_s[missing_idx] <- NA source_list[[s]] <- X_s } # Target data (50 x 50, complete) n0 <- 50; p0 <- 50 M_target_true <- M_s[1:n0, 1:p0] X_target <- matrix(rpois(n0 * p0, exp(M_target_true)), n0, p0) # Run transGFM_multi with AD method result_AD <- transGFM_multi( source_data_list = source_list, target_data = X_target, r = 2, data_type = "count", method = "AD", lambda_seq = seq(0, 5, by = 1), K_cv = 3, verbose = FALSE ) # Run transGFM_multi with DA method result_DA <- transGFM_multi( source_data_list = source_list, target_data = X_target, r = 2, data_type = "count", method = "DA", verbose = FALSE ) # Compare results print(paste("AD method error:", relative_error(result_AD$M_trans, M_target_true))) print(paste("DA method error:", relative_error(result_DA$M_trans, M_target_true)))