R2-01-第二期-2 R读取pubmed存入mysql数据库

木萱小主 2018-01-24 20:34:07 阅读: 1683

任务1

完成mysql安装,并调试。尝试导入数据

con <- dbConnect(MySQL(),host="localhost",dbname="rdb",user="root",password="")
t_demo<-data.frame(a=seq(1:10), b=letters[1:10], c=rnorm(10))
t_demo
dbWriteTable(con, "t_demo", t_demo)

1.jpg

任务2

#任务2.获得cell杂志2017年所有文章的id
install.packages("RISmed")
library(RISmed)
cell2017<-EUtilsSummary("cell[TA] AND 2017[DP]")
data<-QueryId(cell2017)
data #获得全部的ID
pmids<-paste(data,sep = "",collapse=","),字符串用分号间隔

任务3 #任务3:通过efetch接口获取title和abstract
library(RMySQL)
library(xml2)
library(httr)
postFetchUrl<-'https://eutils.ncbi.nlm.nih.gov/entrez/eutils/efetch.fcgi?'
r2 <- POST(postFetchUrl,body = list(db='pubmed',id=pmids,retmode='xml'))
stop_for_status(r2)
data2=content(r2, "parsed")
article=xml_children(data2)
count=length(article)
cnt=1
while(cnt<=count){title=xml_find_first(article[cnt],".//ArticleTitle")
abstract=xml_find_first(article[cnt],".//AbstractText")
write.table(print(xml_text(title)),file='D:/a.txt',row.names=F,quote=F,append=T)
write.table(print(xml_text(abstract)),file='D:/b.txt',row.names=F,quote=F,append=T)
cnt = cnt + 1
}

先用txt做一个吧,RMySQL没整出来,有空再接着做

a<-read.csv(file.choose(),header=T)
b<-read.csv(file.choose(),header=T)
c<-data
article<-data.frame(pmid=c,title=a,abstract=b)
con<-dbConnect(MySQL(),host="localhost",dbname="rdb",user="root",password="")
dbSendQuery(con,'SET NAMES utf8')
dbWriteTable(con, "article2", article) #先弄一个article2 以后用RMySQL再弄article
2.jpg

 重建一个数据库“”rdb2“”

con<-dbConnect(MySQL(),host="localhost",dbname="rdb2",user="root",password="")
dbSendQuery(con,'SET NAMES utf8')
输出:<MySQLResult:2,2,0>

 清理所有的连接

killDbConnections()

输出:[[1]]<MySQLConnection:0,0>

[[2]]
<MySQLConnection:0,1>

[[3]]
<MySQLConnection:0,2>

[1] "3  connections killed."
Warning messages:
1: Closing open result sets
2: Closing open result sets

任务2

con<-dbConnect(MySQL(),host="localhost",dbname="rdb2",user="root",password="")
dbSendQuery(con,'SET NAMES utf8')
library(httr)
totalNum=562
pageSize=10
 totalPage=ceiling(totalNum/pageSize)
currentPage=1
term='(cell[TA]) AND 2017[DP]'
Url='https://eutils.ncbi.nlm.nih.gov/entrez/eutils/esearch.fcgi'
while(currentPage<=totalPage){
  retstart=(currentPage-1)*pageSize
  r <- POST(Url,
            body = list(
              db='pubmed',
              term=term,
              retmode='json',
              retstart=retstart,
              retmax=pageSize,
              usehistory=usehistory,
              rettype='uilist'
            )
  )
   
  stop_for_status(r)
  data=content(r, "parsed", "application/json")
  esearchresult=data$esearchresult
  idlist =esearchresult$idlist
  n = length(idlist)
  pmid=c()
  i = 1
  while(i<=n){
    pmid=c(pmid, as.character(idlist[i][1]))
    i = i+1
  }
   article=data.frame('pmid'=pmid)
   dbWriteTable(con,"article",article,append=TRUE)
   currentPage = currentPage + 1
}
#close

dbDisconnect(con)

火狐截图_2018-03-01T11-43-52_211Z.png

任务3

library(xml2)
con <- dbConnect(MySQL(),host="localhost",dbname="rdb2",user="root",password="")
dbSendQuery(con,'SET NAMES utf8')
nb<- dbSendQuery(con, "SELECT * FROM article WHERE isdone=0")
()()()while (!dbHasCompleted(nb)) {
  chunk <- dbFetch(nb, 10)
  pmidStr=""
  i=1
  n=nrow(chunk) #获得总行数
  while (i<=n){
    pmidStr = paste(pmidStr,chunk[i,3],sep=",") #循环将各个pmid之间用逗号连接起来
    i = i + 1
  }
  pmidStr=substr(pmidStr,2,100000) #去掉pmid第一个逗号,从第2位起,到100000位,即到末尾()()()()()()()()()()()()()))))

???貌似更容易些  pmids<-paste(data,sep = "",collapse=","),字符串用分号间隔



  #下面就是第一次作业里面获取title和abstract
 Url='https://eutils.ncbi.nlm.nih.gov/entrez/eutils/efetch.fcgi'
  r2 <- POST(Url,
             body = list(
               db='pubmed',
               id=pmidStr,
               retmode='xml'
             )
  )
  stop_for_status(r2)
  data2=content(r2, "parsed", "application/xml")
  article=xml_children(data2)
  count=length(article)
  cnt=1
  while(cnt<=count){
    title=xml_text(xml_find_first(article[cnt],".//ArticleTitle"))
    abstract=xml_text(xml_find_first(article[cnt],".//AbstractText"))
    pmid=xml_text(xml_find_first(article[cnt],".//PMID"))
    title = gsub("'","",title)
    abstract = gsub("'","",abstract)
    sql=paste("UPDATE article SET title='",title,"',abstract='",abstract,"',isdone=1"," where pmid='",pmid,"'",sep="")

    con2 <- dbConnect(MySQL(),host="localhost",dbname="rdb2",user="root",password="")
    dbSendQuery(con2,'SET NAMES utf8')
    dbSendQuery(con2,sql)
    dbDisconnect(con2)
    cnt = cnt + 1
    Sys.sleep(1)
  }
}

火狐截图_2018-03-01T11-54-51_834Z.png 

 
邀请讨论

附件

{{f.title}} 大小 {{f.file_size}} 下载 {{f.count_download}} 金币 {{f.count_gold}}
{{item.nick_name}} 受邀请回答 {{item.create_time}}
{{item.refer_comment.nick_name}} {{item.refer_comment.create_time}}

附件

{{f.title}} 大小 {{f.file_size}} 下载 {{f.count_download}} 金币 {{f.count_gold}}
切换到完整回复 发送回复
赞({{item.count_zan}}) 踩({{item.count_cai}}) 删除 回复 关闭
科研狗©2015-2024 科研好助手,京ICP备20005780号-1 建议意见

服务热线

178 0020 3020

微信服务号