生物医学工程理论与实践/R语言

数据类型

R 拥有多种用于保存数据的对象，包括标量、向量、矩阵、数组、数据框和列表。

标量和常量

"标量" 通常是指只含一个元素的向量（即长度为 1 的向量）。常量始终只有一个值。您可以将常量视为零维值（单个点）。

标量

> x<-3
> y<-6
> z<-x+y
> z
[1] 9

常量

> 2+3
[1] 5
> 5-4
[1] 1
> 6*4
[1] 24

向量

向量是一维数组，可以保存数值数据、字符数据或逻辑数据。组合函数 c() 用于形成向量。以下是每种类型向量的示例

> a<-c(1,2,5,-3,-6,5) #numeric vector
> b<-c("one","two","three") #character vector
> d<-c(TRUE,FALSE,TRUE,FALSE,TRUE,TRUE) #logical vector
> a[c(2,4)]
[1]  2 -3
> a[4]
[1] -3
> a[2:4]
[1]  2  5 -3

矩阵

矩阵是一个二维数组，其中每个元素具有相同的模式（数值、字符或逻辑）。矩阵使用 matrix() 函数创建。一般格式如下

> mymatrix <- matrix(vector, nrow=number of rows, ncol=number of columns,byrow=logical value, dimnames=list(vector-of-rownames,vector-of-colnames))

> A<-matrix(1:9,nrow=3)
> A
     [,1] [,2] [,3]
[1,]    1    4    7
[2,]    2    5    8
[3,]    3    6    9
> A[2,1]
[1] 2

> A<-matrix(1:9,nrow=3,byrow=T)
> A
     [,1] [,2] [,3]
[1,]    1    2    3
[2,]    4    5    6
[3,]    7    8    9

数组

> myarray <-array(vector, dimensions, dimnames)

> dim1 <- c("A1","A2","A3")
> dim2 <- c("B1","B2","B3","B4")
> dim3 <- c("C1","C2")
> x<-array(1:24,c(3,4,2),dimnames=list(dim1,dim2,dim3))
> x
, , C1

   B1 B2 B3 B4
A1  1  4  7 10
A2  2  5  8 11
A3  3  6  9 12

, , C2

   B1 B2 B3 B4
A1 13 16 19 22
A2 14 17 20 23
A3 15 18 21 24

数据框

>mydata <-data.frame(col1,col2,col3....)

> patientID<-LETTERS[1:4]
> age<-c(24,35,28,52)
> diabetes<-c("Type1","Type2","Type1","Type2")
> stats<-c("Poor","Improved","Poor","Excellent")
> patientDATA<-data.frame(patientID,age,diabetes,stats,row.names=letters[1:4])
> patientDATA
  patientID age diabetes     stats
a         A  24    Type1      Poor
b         B  35    Type2  Improved
c         C  28    Type1      Poor
d         D  52    Type2 Excellent

因子

> patientID<-LETTERS[1:4]
> age<-c(24,35,28,52)
> diabetes<-c("Type1","Type2","Type1","Type2")
> stats<-c("Poor","Improved","Poor","Excellent")
> status <- factor(stats, order=TRUE)
> patientdata <- data.frame(patientID, age, diabetes, status)
> str(patientdata)
'data.frame':	4 obs. of  4 variables:
 $ patientID: Factor w/ 4 levels "A","B","C","D": 1 2 3 4
 $ age      : num  24 35 28 52
 $ diabetes : Factor w/ 2 levels "Type1","Type2": 1 2 1 2
 $ status   : Ord.factor w/ 3 levels "Excellent"<"Improved"<..: 3 2 3 1
> summary(patientdata)
 patientID      age         diabetes       status 
 A:1       Min.   :24.00   Type1:2   Excellent:1  
 B:1       1st Qu.:27.00   Type2:2   Improved :1  
 C:1       Median :31.50             Poor     :2  
 D:1       Mean   :34.75                          
           3rd Qu.:39.25                          
           Max.   :52.00

列表

>mylist <- list(name1=object1,name2=object2,...)

> x<-"TheList"
> y<-c(25,19,20)
> z<-matrix(1:10,nrow=2,byrow=TRUE)
> theta<-LETTERS[1:10]
> delta<-c(2+3i,4-6i)
> mylist<-list(title=x,components=y,z,theta,delta)
> mylist
$title
[1] "TheList"

$components
[1] 25 19 20

[[3]]
     [,1] [,2] [,3] [,4] [,5]
[1,]    1    2    3    4    5
[2,]    6    7    8    9   10

[[4]]
 [1] "A" "B" "C" "D" "E" "F" "G" "H" "I" "J"

[[5]]
[1] 2+3i 4-6i

> mylist[[2]]
[1] 25 19 20
> mylist[["components"]]
[1] 25 19 20

> lapply(mylist,length)
$title
[1] 1

$components
[1] 3

[[3]]
[1] 10

[[4]]
[1] 10

[[5]]
[1] 2

> lapply(mylist,class)
$title
[1] "character"

$components
[1] "numeric"

[[3]]
[1] "matrix"

[[4]]
[1] "character"

[[5]]
[1] "complex"

> lapply(mylist,mean)
$title
[1] NA

$components
[1] 21.33333

[[3]]
[1] 5.5

[[4]]
[1] NA

[[5]]
[1] 3-1.5i

Warning messages:
1: In mean.default(X[[1L]], ...) :
  argument is not numeric or logical: returning NA
2: In mean.default(X[[4L]], ...) :
  argument is not numeric or logical: returning NA

基本函数

算术运算符

下表列出了 R 编程中使用的算术运算符及其示例。

函数	R 命令	示例
幂运算， $a^{b}$	> a^b	> 3^2 [1] 9
乘法， $a\times b$	> a*b	> 22*5 [1] 110
除法， $a\div b$	> a/b	> 30/3 [1] 10
加法， $a+b$	> a+b	> 10+9 [1] 19
减法， $a-b$	> a-b	> 10-3 [1] 7
整数（商）	> a%/%b	> 20%/%3 [1] 6
模运算（余数）	> a%%b	> 20%%3 [1] 2

复数

> x<-5.2-3i

	R 命令		R 命令
实部	> Re(x) [1] 5.2	虚部	> Im(x) [1] -3
虚部	> Im(x) [1] -3	模数	> Mod(x) [1] 6.003332
幅角	> Arg(x) [1] -0.5232783	共轭	> Conj(x) [1] 5.2+3i
成员资格	> is.complex(x) [1] TRUE	强制类型转换	> as.complex(19.6) [1] 19.6+0i

舍入

函数	R 命令	函数	R 命令
不大于给定数的最大整数（向下取整）	> floor(9.9) [1] 9 > floor(-9.9) [1] -10	不小于给定数的最小整数（向上取整）	> ceiling(9.9) [1] 10 > ceiling(-9.9) [1] -9
舍入函数	> round(9.9) [1] 10 > round(9.2) [1] 9	去掉小数部分	> trunc(8.6) [1] 8 > trunc(-8.6) [1] -8

三角函数

函数	三角函数	三角函数的反函数	双曲函数	双曲函数的反函数
正弦	sin(x)	asin(x)	sinh(x)	asinh(x)
余弦	cos(x)	acos(x)	cosh(x)	acosh(x)
正切	tan(x)	atan(x)	tanh(x)	atanh(x)

对数和指数函数

函数	R 命令	R 例子
绝对值， $|x|$	abs(x)	> abs(-7.4) [1] 7.4
以 e 为底的对数， $\log _{e}(x)$	log(x)	> log(10) [1] 2.302585
以 10 为底的对数， $\log _{10}(x)$	log10(x)	> log10(100) [1] 2
以 n 为底的 x 的对数	log(x,n)	> log(64,4) [1] 3
$e^{x}$	exp(x)	> exp(3) [1] 20.08554
${\sqrt {x}}$	sqrt(x)	> sqrt(25) [1] 5
$n!$	factorial(x)	> factorial(10) [1] 3628800
${\frac {n!}{r!(n-r)!}}$	choose(n,r)	> choose(5,4) [1] 5

关系运算符和逻辑变量

关系运算符

关系运算符
等于	==
不等于	!=
小于	<
大于	>
小于或等于	<=
大于或等于	>=

TRUE=1,FALSE=0

> x<-c(6,3,4)
> y<-c(5,15,9)
> z<-(x<y)
> z
[1] FALSE  TRUE  TRUE
> z<-(x<y)+5
> z
[1] 5 6 6

逻辑运算符

$A$	$B$	$!A$	$A$ & $B$	$A|B$	$xor(A,B)$
False(0)	False(0)	True(1)	False(0)	False(0)	False(0)
False(0)	True(1)	True(1)	False(0)	True(1)	True(1)
True(1)	False(0)	False(0)	False(0)	True(1)	True(1)
True(1)	True(1)	False(0)	True(1)	True(1)	False(0)

> x<-c(6,2,8)
> y<-c(14,6,7)
> z<-c(4,5,11)
> z1<-x>y
> z1
[1] FALSE FALSE  TRUE
> z2<-y>z
> z2
[1]  TRUE  TRUE FALSE
> z3<-(x>y) & (y>z)
> z3
[1] FALSE FALSE FALSE

> z1<-xor(x,y)
> z1
[1] FALSE FALSE FALSE

序列生成和重复

> x1<-seq(0,3,by=0.5)
> x1
[1] 0.0 0.5 1.0 1.5 2.0 2.5 3.0
> x2 <- seq(from=0.4,by=0.01,length=15)
> x2
 [1] 0.40 0.41 0.42 0.43 0.44 0.45 0.46 0.47 0.48 0.49 0.50 0.51 0.52 0.53 0.54
> x3<-seq(1.4,2.1,0.3)
> x3
[1] 1.4 1.7 2.0
> x4<-rep(15,7)
> x4
[1] 15 15 15 15 15 15 15
> x5<-rep(1:4,3)
> x5
 [1] 1 2 3 4 1 2 3 4 1 2 3 4
> x6<-rep(1:3,each=2,times=3)
> x6
 [1] 1 1 2 2 3 3 1 1 2 2 3 3 1 1 2 2 3 3
> x7<-rep(c("a","b","c"),c(1,2,3))
> x7
[1] "a" "b" "b" "c" "c" "c"

随机数生成

> set.seed(100)
> runif(5)
[1] 0.5465586 0.1702621 0.6249965 0.8821655 0.2803538
> runif(5)
[1] 0.3984879 0.7625511 0.6690217 0.2046122 0.3575249

> x<-c(5,10,8,6,9,11,14,16,18)
> sample(x)
[1]  6 11 18  9  8 16 10  5 14
> sample(x)
[1] 16  9 10  8 18 14 11  6  5
> sample(x,4)
[1] 10 11 14  5

向量函数

长度和统计

> x<-c(6,9,11,14,12,2,33,76,0,90)

函数	R 命令	函数	R 命令
长度	> length(x) [1] 10	平均值	> mean(x) [1] 25.3
最大值	> max(x) [1] 90	最小值	> min(x) [1] 0
分布	> quantile(x) 0% 25% 50% 75% 100% 0.00 6.75 11.50 28.25 90.00	排序	> sort(x) [1] 0 2 6 9 11 12 14 33 76 90

函数	R 命令
引用向量中的第 5 个元素	> x[5] [1] 12
从向量中删除第 3 个元素	> x1<-x[-3] > x1 [1] 6 9 14 12 2 33 76 0 90
从向量中删除最后一个元素	> x2<-x[-length(x)] > x2 [1] 6 9 11 14 12 2 33 76 0
从向量中删除第一个和最后一个元素	> x3<-x[c(-1,-length(x))] > x3 [1] 9 11 14 12 2 33 76 0
从向量中删除最小的 2 个元素和最大的 3 个元素	> trim <-function(x)sort(x)[-c(1,2,length(x)-2,length(x)-1,length(x))] > trim(x) [1] 6 9 11 12 14

	R 代码
求和	> sum(x) [1] 253
平均值，中位数	> mean(x) [1] 25.3	> median(x) [1] 11.5
范围	> range(x) [1] 0 90
标准差，方差	> sd(x) [1] 31.87841	> var(x) [1] 1016.233
找出最大和最小值	> which(x==max(x)) [1] 10	> which(x==min(x)) [1] 9
排序和逆序排序	> sort(x) [1] 0 2 6 9 11 12 14 33 76 90	> rev(sort(x)) [1] 90 76 33 14 12 11 9 6 2 0

> x<-matrix(rpois(15,1.2),nrow=3)
> x
     [,1] [,2] [,3] [,4] [,5]
[1,]    2    1    0    3    3
[2,]    0    2    3    1    2
[3,]    2    2    0    1    1
> mean(x[,5])
[1] 2
> var(x[3,])
[1] 0.7
> rowSums(x)
[1] 9 8 6
> colSums(x)
[1] 4 5 3 5 6
> rowMeans(x)
[1] 1.8 1.6 1.2
> colMeans(x)
[1] 1.333333 1.666667 1.000000 1.666667 2.000000

并行最小值和最大值

> x<-c(2,5,10,-6,29,45)
> y<-c(5,9,15,-22,38,88)
> z<-c(9,10,2,7,55,24)
> q<-c(22,3,5,6,-23,88)
> pmin(x,y,z,q)
[1]   2   3   2 -22 -23  24
> pmax(x,y,z,q)
[1] 22 10 15  7 55 88

'table' 和 'tapply'

> data(ChickWeight)
> ChickWeight
weight Time Chick Diet
1     42    0     1    1
2     51    2     1    1
3     59    4     1    1
4     64    6     1    1
5     76    8     1    1
6     93   10     1    1
.......................
576    234   18    50    4
577    264   20    50    4
578    264   21    50    4
> tapply(ChickWeight$weight,ChickWeight$Time,mean)
        0         2         4         6         8        10        12        14 
 41.06000  49.22000  59.95918  74.30612  91.24490 107.83673 129.24490 143.81250 
       16        18        20        21 
168.08511 190.19149 209.71739 218.68889 
> tapply(ChickWeight$weight,ChickWeight$Diet,median)
    1     2     3     4 
 88.0 104.5 125.5 129.5

> codon1=c("UUU","UUC","UUA","UUG","UUA","UUG","UUC")
> table(codon1)
codon1
UUA UUC UUG UUU 
  2   2   2   1 
> aminoacid=list(Phe=c("UUU","UUC"),Leu=c("UUA","UUG"))
> codon=as.factor(codon1)
> levels(codon)=aminoacid
> codon
[1] Phe Phe Leu Leu Leu Leu Phe
Levels: Phe Leu
> table(codon)
codon
Phe Leu 
  3   4

'apply'

> x<-matrix(1:15,nrow=3,byrow=T)
> x
     [,1] [,2] [,3] [,4] [,5]
[1,]    1    2    3    4    5
[2,]    6    7    8    9   10
[3,]   11   12   13   14   15
> apply(x,1,sum)
[1] 15 40 65
> apply(x,2,sum)
[1] 18 21 24 27 30
> apply(x,1,sqrt)
         [,1]     [,2]     [,3]
[1,] 1.000000 2.449490 3.316625
[2,] 1.414214 2.645751 3.464102
[3,] 1.732051 2.828427 3.605551
[4,] 2.000000 3.000000 3.741657
[5,] 2.236068 3.162278 3.872983
> apply(x,2,sqrt)
         [,1]     [,2]     [,3]     [,4]     [,5]
[1,] 1.000000 1.414214 1.732051 2.000000 2.236068
[2,] 2.449490 2.645751 2.828427 3.000000 3.162278
[3,] 3.316625 3.464102 3.605551 3.741657 3.872983

排序、排名、顺序

> x<-c(2,5,10,-6,29,45)
> # rank: the rank of unsorted vector
> rank(x)
[1] 2 3 4 1 5 6
> # order: the indices that would sort the vector (x[order(x)] is sorted)
> order(x)
[1] 4 1 2 3 5 6

唯一值和重复值

> x<-c("a","b","c","a","a","a","b","c")
> table(x)
x
a b c 
4 2 2 
> unique(x)
[1] "a" "b" "c"
> duplicated(x)
[1] FALSE FALSE FALSE  TRUE  TRUE  TRUE  TRUE  TRUE
> x[!duplicated(x)]
[1] "a" "b" "c"

运行长度

> x<-rpois(20,0.5)
> x
 [1] 2 0 0 1 0 1 0 0 1 1 0 0 2 0 0 0 0 0 0 0
> rle(x)
Run Length Encoding
  lengths: int [1:10] 1 2 1 1 1 2 2 2 1 7
  values : int [1:10] 2 0 1 0 1 0 1 0 2 0

集合函数

> setA <-c("I","II","III","IV","V")
> setB <-c("III","IV","V","VI")
> union(setA,setB)
[1] "I"   "II"  "III" "IV"  "V"   "VI" 
> intersect(setA,setB)
[1] "III" "IV"  "V"  
> setdiff(setA,setB)
[1] "I"  "II"
> setdiff(setB,setA)
[1] "VI"

练习

参考文献