Original package used to generate the new package

library(dataone)
library(datapack)

# Client for the DataONE "STAGING" environment, bound to the Arctic test
# member node.
d1c <- D1Client("STAGING", "urn:node:mnTestARCTIC")

# Fetch the original package by its metadata identifier. As the log below
# shows, the metadata object is downloaded while the remaining members are
# only lazy loaded.
origPkg <- getDataPackage(
  d1c,
  identifier = "urn:uuid:5cfda337-4d6b-49a1-ac64-c6bc62d8d439",
  lazyLoad = TRUE,
  limit = "0MB",
  quiet = FALSE
)
## Downloading package members for package with metadata identifier: urn:uuid:5cfda337-4d6b-49a1-ac64-c6bc62d8d439
## Downloaded object at URL https://test.arcticdata.io/metacat/d1/mn/v2/object/urn:uuid:5cfda337-4d6b-49a1-ac64-c6bc62d8d439
## Lazy Loaded object at URL https://test.arcticdata.io/metacat/d1/mn/v2/object/urn:uuid:9d38badf-96c4-4f23-92be-1d55ab2c2556
## Lazy Loaded object at URL https://test.arcticdata.io/metacat/d1/mn/v2/object/urn:uuid:99f677ec-c020-4e9d-8526-635feb02792e
## Lazy Loaded object at URL https://test.arcticdata.io/metacat/d1/mn/v2/object/urn:uuid:173e1255-4102-4aff-9992-15052b20f755
## Lazy Loaded object at URL https://test.arcticdata.io/metacat/d1/mn/v2/object/urn:uuid:33a0d503-8bf0-4dd6-94e6-a856fb858c50
## Getting resource map with id: resource_map_urn:uuid:5cfda337-4d6b-49a1-ac64-c6bc62d8d439

Bundling new package

Create metadata in eml
# Run the external script that prepares the EML metadata.
# NOTE(review): presumably this defines the `eml` object that is later passed
# to eml_add_entities() in this file - confirm against ../createEml.R.
source("../createEml.R")
Create a package
# Create the DataPackage object that the metadata and data members are added to.
dp <- new("DataPackage")
Add members
# Installs arcticdatautils from GitHub unconditionally, i.e. every time this
# script is run.
# NOTE(review): guess_format_id() and the eml_* helpers used in this file
# presumably come from this package - confirm.
devtools::install_github("nceas/arcticdatautils")
## Downloading GitHub repo nceas/arcticdatautils@master
## from URL https://api.github.com/repos/nceas/arcticdatautils/zipball/master
## Installing arcticdatautils
## '/Library/Frameworks/R.framework/Resources/bin/R' --no-site-file  \
##   --no-environ --no-save --no-restore --quiet CMD INSTALL  \
##   '/private/var/folders/7t/xdmtt00j0w53slb206w9xh100000gn/T/RtmplWeInQ/devtoolsf9ff78aff70c/NCEAS-arcticdatautils-ed26220'  \
##   --library='/Library/Frameworks/R.framework/Versions/3.3/Resources/library'  \
##   --install-tests
## 
library(arcticdatautils)

# Wrap the EML metadata file in a DataObject and add it to the package first,
# so that every data member added afterwards can be linked to it.
metadataObj <- new(
  "DataObject",
  format = "eml://ecoinformatics.org/eml-2.1.1",
  filename = "./meta.xml"
)
dp <- addMember(dp, metadataObj)

# Collect the files in the working directory, leaving out metadata (.xml)
# files and .zip archives. `invert` is spelled out in full instead of relying
# on partial argument matching.
flists <- grep("\\.(xml|zip)$", list.files("."), invert = TRUE, value = TRUE)

# Filter out directory names
flists <- setdiff(flists, list.dirs(recursive = FALSE, full.names = FALSE))

# Add every remaining file as a member; passing metadataObj as the third
# argument yields the cito:documents / cito:isDocumentedBy relationships
# shown in the printout below.
for (o in flists) {
  obj <- new(
    "DataObject",
    format = guess_format_id(o),
    filename = paste0("./", o)
  )
  dp <- addMember(dp, obj, metadataObj)
}

print(dp)
## Members:
## 
## filename        format    mediaType  size     identifier    modified local 
## new_analysis.R  app...eam NA         376      urn:u...5828a n        y     
## new_mt..._g.png image/png NA         79800    urn:u...bded8 n        y     
## meta.xml        eml...1.1 NA         1790     urn:u...cac85 n        y     
## new_mt...ed.csv text/csv  NA         5401     urn:u...869af n        y     
## 
## Package identifier: NA
## RightsHolder: NA
## 
## 
## Relationships (updated):
## 
##                 subject           predicate                object
## 1              meta.xml      cito:documents        new_analysis.R
## 3              meta.xml      cito:documents      new_mtcars_g.png
## 5              meta.xml      cito:documents new_mtcar...hered.csv
## 2        new_analysis.R cito:isDocumentedBy              meta.xml
## 6 new_mtcar...hered.csv cito:isDocumentedBy              meta.xml
## 4      new_mtcars_g.png cito:isDocumentedBy              meta.xml
# add otherEntity
# Look up the package identifier of every data file by its file name, and
# turn each file name into a "./"-prefixed path for the entity table.
# seq_along() is used so that an empty `flists` skips the loop instead of
# iterating over c(1, 0) and creating a bogus "./NA" entry.
pids <- NULL
for (i in seq_along(flists)) {
  pid <- selectMember(dp, name = "sysmeta@fileName", value = flists[i])
  pids <- c(pids, pid)
  flists[i] <- paste0("./", flists[i])
}

# Describe each data file as an otherEntity in the EML document. With no data
# files there is nothing to add (and data.frame() would fail on the length
# mismatch), so the step is skipped.
if (length(flists) > 0) {
  entity_df <- data.frame(
    type = "otherEntity",
    path = flists,
    pid = pids,
    format_id = guess_format_id(flists),
    stringsAsFactors = FALSE
  )
  eml <- eml_add_entities(eml, entity_df)
}

# Write the metadata back to disk and validate the written file.
write_eml(eml, "./meta.xml")
eml_validate("./meta.xml")
## [1] TRUE
## attr(,"errors")
## character(0)
# Show the EML document now that the otherEntity elements have been added.
print(eml)
## <eml packageId="66360f48-9f72-4ae8-a09b-a326f273f90b" system="uuid" xsi:schemaLocation="eml://ecoinformatics.org/eml-2.1.1 eml.xsd">
##   <dataset>
##     <title>test2 storing provenance into resmap</title>
##     <creator>
##       <individualName>
##         <givenName>Seokki</givenName>
##         <surName>Lee</surName>
##       </individualName>
##       <electronicMailAddress>slee195@hawk.iit.edu</electronicMailAddress>
##     </creator>
##     <pubDate>2018-07-13</pubDate>
##     <abstract>test abs</abstract>
##     <coverage>
##       <geographicCoverage>
##         <geographicDescription>Chicago</geographicDescription>
##         <boundingCoordinates>
##           <westBoundingCoordinate>-87.623177</westBoundingCoordinate>
##           <eastBoundingCoordinate>-87.623177</eastBoundingCoordinate>
##           <northBoundingCoordinate>41.881832</northBoundingCoordinate>
##           <southBoundingCoordinate>41.881832</southBoundingCoordinate>
##         </boundingCoordinates>
##       </geographicCoverage>
##       <temporalCoverage>
##         <rangeOfDates>
##           <beginDate>
##             <calendarDate>2018-07-09</calendarDate>
##           </beginDate>
##           <endDate>
##             <calendarDate>2018-07-13</calendarDate>
##           </endDate>
##         </rangeOfDates>
##       </temporalCoverage>
##     </coverage>
##     <contact>
##       <individualName>
##         <givenName>Seokki</givenName>
##         <surName>Lee</surName>
##       </individualName>
##       <organizationName>IIT</organizationName>
##       <phone>000-000-0000</phone>
##       <electronicMailAddress>slee195@hawk.iit.edu</electronicMailAddress>
##     </contact>
##     <otherEntity id="urn:uuid:1752ca89-e597-4b33-b0d0-7fd55af5828a" scope="document">
##       <entityName>new_analysis.R</entityName>
##       <physical scope="document">
##         <objectName>new_analysis.R</objectName>
##         <size unit="bytes">376</size>
##         <authentication method="SHA-1">6db939354e2593c0d83a55c7166dcd2595569fab</authentication>
##         <dataFormat>
##           <externallyDefinedFormat>
##             <formatName>application/octet-stream</formatName>
##           </externallyDefinedFormat>
##         </dataFormat>
##         <distribution scope="document">
##           <online>
##             <url function="download">https://cn.dataone.org/cn/v2/resolve/urn:uuid:1752ca89-e597-4b33-b0d0-7fd55af5828a</url>
##           </online>
##         </distribution>
##       </physical>
##       <entityType>Other</entityType>
##     </otherEntity>
##     <otherEntity id="urn:uuid:93af879f-e121-4891-be99-b49e5f4bded8" scope="document">
##       <entityName>new_mtcars_g.png</entityName>
##       <physical scope="document">
##         <objectName>new_mtcars_g.png</objectName>
##         <size unit="bytes">79800</size>
##         <authentication method="SHA-1">3cd8e638ed05d01f293f6b878788a4950b178e34</authentication>
##         <dataFormat>
##           <externallyDefinedFormat>
##             <formatName>image/png</formatName>
##           </externallyDefinedFormat>
##         </dataFormat>
##         <distribution scope="document">
##           <online>
##             <url function="download">https://cn.dataone.org/cn/v2/resolve/urn:uuid:93af879f-e121-4891-be99-b49e5f4bded8</url>
##           </online>
##         </distribution>
##       </physical>
##       <entityType>Other</entityType>
##     </otherEntity>
##     <otherEntity id="urn:uuid:e9fff181-5862-4b1d-964b-fc5e438869af" scope="document">
##       <entityName>new_mtcars_gathered.csv</entityName>
##       <physical scope="document">
##         <objectName>new_mtcars_gathered.csv</objectName>
##         <size unit="bytes">5401</size>
##         <authentication method="SHA-1">38dd3e4fd9e7aaf9e43ee073f66f36fa34dd264c</authentication>
##         <dataFormat>
##           <externallyDefinedFormat>
##             <formatName>text/csv</formatName>
##           </externallyDefinedFormat>
##         </dataFormat>
##         <distribution scope="document">
##           <online>
##             <url function="download">https://cn.dataone.org/cn/v2/resolve/urn:uuid:e9fff181-5862-4b1d-964b-fc5e438869af</url>
##           </online>
##         </distribution>
##       </physical>
##       <entityType>Other</entityType>
##     </otherEntity>
##   </dataset>
## </eml>
# The EML file on disk was rewritten above, so swap the package's metadata
# member for the current contents of ./meta.xml, then show the package again.
dp <- replaceMember(
  dp,
  metadataObj,
  replacement = "./meta.xml",
  formatId = "eml://ecoinformatics.org/eml-2.1.1"
)
print(dp)
## Members:
## 
## filename        format    mediaType  size     identifier    modified local 
## new_analysis.R  app...eam NA         376      urn:u...5828a n        y     
## new_mt..._g.png image/png NA         79800    urn:u...bded8 n        y     
## meta.xml        eml...1.1 NA         4274     urn:u...cac85 y        y     
## new_mt...ed.csv text/csv  NA         5401     urn:u...869af n        y     
## 
## Package identifier: NA
## RightsHolder: NA
## 
## 
## Relationships (updated):
## 
##                 subject           predicate                object
## 1              meta.xml      cito:documents        new_analysis.R
## 3              meta.xml      cito:documents      new_mtcars_g.png
## 5              meta.xml      cito:documents new_mtcar...hered.csv
## 2        new_analysis.R cito:isDocumentedBy              meta.xml
## 6 new_mtcar...hered.csv cito:isDocumentedBy              meta.xml
## 4      new_mtcars_g.png cito:isDocumentedBy              meta.xml
Capture provenance
  • Add information about which object is used by the new analysis (e.g., (new_analysis.R, “prov:used”, “mtcars.csv”))
  • Add information about which package the used object belongs to (e.g., (new_analysis.R, “prov:used”, Alice’s resource map id))
# Installs recordr from GitHub unconditionally, i.e. every time this script
# is run.
devtools::install_github("nceas/recordr")
## Downloading GitHub repo nceas/recordr@master
## from URL https://api.github.com/repos/nceas/recordr/zipball/master
## Installing recordr
## '/Library/Frameworks/R.framework/Resources/bin/R' --no-site-file  \
##   --no-environ --no-save --no-restore --quiet CMD INSTALL  \
##   '/private/var/folders/7t/xdmtt00j0w53slb206w9xh100000gn/T/RtmplWeInQ/devtoolsf9ff502e6736/NCEAS-recordr-553ca40'  \
##   --library='/Library/Frameworks/R.framework/Versions/3.3/Resources/library'  \
##   --install-tests
## 
library(recordr)
# Create the Recordr object that is passed to record() and viewRuns() below.
# No "newDir" argument is given, so (per the message printed below) run
# information is kept under the R session temporary directory and is removed
# when the session ends.
rc <- new("Recordr")
## A new recordr home directory has been created at:
## 
##  /var/folders/7t/xdmtt00j0w53slb206w9xh100000gn/T//RtmplWeInQ/recordr
## The recordr package will save run information to this directory, which is under
## the R session temporary directory. Therefore the information that recordr collects
## will be removed by R when the current R session ends.
## 
## If you wish to change the recordr home directory so that information is saved to a
## permanent location please use the "newDir" argument, for example
## 
##  "rc <- new("Recordr", newDir="/Users/bobsmith/recordr")
# Every R script in the working directory is treated as an analysis program
# whose provenance should be captured.
progFiles <- list.files(".", pattern = "\\.R$")

if (length(progFiles) > 0) {
  # Execute each script under recordr so that the files it reads and writes
  # are captured; the file name doubles as the run tag.
  for (i in progFiles) {
    record(rc, paste0("./", i), tag = i)
  }

  # Add provenance into the package
  for (i in seq_along(progFiles)) {
    # capture file names and access types
    # NOTE(review): this assumes recordr run sequence number i belongs to
    # progFiles[i], which presumably only holds for a fresh recordr home
    # directory - confirm.
    vr <- viewRuns(rc, seq = i)
    vrdf <- setNames(
      data.frame(basename(vr$files$filePath), vr$files$access),
      c("names", "access")
    )
    vrUsed <- subset(vrdf, access == "read")
    vrDerived <- subset(vrdf, access == "write")

    # capture ids for sources (those files used and read)
    # seq_along() keeps the loop from running when the script read no files;
    # 1:length() would iterate over c(1, 0) and add spurious sources.
    sids <- NULL
    for (n in seq_along(vrUsed$names)) {
      usedName <- as.character(vrUsed$names[n])

      sid <- selectMember(dp, name = "sysmeta@fileName", value = usedName)
      sids <- c(sids, sid)

      # add a connection to Alice's package as a relationship into Bob's package
      # information of the original package that the source data belongs to
      # (the resource map id is added only once)
      if (!is.element(origPkg@resmapId, sids)) {
        sids <- c(sids, origPkg@resmapId)
      }

      # information of the object in the original package that Bob's analysis used
      sid <- selectMember(origPkg, name = "sysmeta@fileName", value = usedName)
      sids <- c(sids, sid)
      #sids <- c(sids, as.character(vrUsed$names[n]))
    }

    # capture ids for outputs (those files derived)
    oids <- NULL
    for (o in seq_along(vrDerived$names)) {
      oid <- selectMember(
        dp,
        name = "sysmeta@fileName",
        value = as.character(vrDerived$names[o])
      )
      oids <- c(oids, oid)
    }

    # capture id for program script
    pid <- selectMember(dp, name = "sysmeta@fileName", value = progFiles[i])

    # create provenance relationships
    dp <- describeWorkflow(dp, sources = sids, program = pid, derivations = oids)
  }

  print(dp)
}
## rgdal: version: 1.3-3, (SVN revision 759)
##  Geospatial Data Abstraction Library extensions to R successfully loaded
##  Loaded GDAL runtime: GDAL 2.3.0, released 2018/05/04
##  Path to GDAL shared files: /usr/local/Cellar/gdal/2.3.0/share/gdal
##  GDAL binary built with GEOS: TRUE 
##  Loaded PROJ.4 runtime: Rel. 5.1.0, June 1st, 2018, [PJ_VERSION: 510]
##  Path to PROJ.4 shared files: (autodetected)
##  Linking to sp version: 1.3-1
## 
## Attaching package: 'tidyr'
## The following object is masked from 'package:raster':
## 
##     extract
## [details]: Run details
## ----------------------
## "/Users/eunjungpark/Documents/data...ct1/fullProv/bobPkg/new_analysis.R" was executed on 2018-07-13 12:46:39 CDT
## Tag: "new_analysis.R"
## Run sequence #: 1
## Publish date: Not published
## Published to: NA
## Published Id: NA
## View at: NA
## Run by user: eunjungpark
## Account subject: NA
## Run Id: urn:uuid:777cebdb-854d-4d47-a9c8-a9cb1cec5249
## Data package Id: urn:uuid:5c585589-36f8-4e93-aec8-378a00be122a
## HostId: eunjeong-bag-ui-MacBook-Pro.local
## Operating system: x86_64-apple-darwin13.4.0
## R version: R version 3.3.3 (2017-03-06)
## Dependencies: stats, graphics, grDevices, utils, datasets, methods, base, Rcpp_0.12.17, git2r_0.21.0, plyr_1.8.4, base64enc_0.1-3, tools_3.3.3, digest_0.6.15, uuid_0.1-2, bit_1.1-14, jsonlite_1.5, evaluate_0.10.1, memoise_1.1.0, RSQLite_2.0, pkgconfig_2.0.1, DBI_1.0.0, commonmark_1.5, curl_3.2, yaml_2.1.16, withr_2.1.2, httr_1.3.1, stringr_1.2.0, roxygen2_6.0.1, xml2_1.2.0, knitr_1.18, devtools_1.13.6, rappdirs_0.3.1, rprojroot_1.3-2, bit64_0.9-7, redland_1.0.17-9, parsedate_1.1.3, R6_2.2.2, hash_2.2.6, XML_3.98-1.9, rmarkdown_1.10, blob_1.1.1, magrittr_1.5, backports_1.1.2, htmltools_0.3.6, stringi_1.2.3, openssl_1.0.1, recordr_1.0.3.9000, arcticdatautils_0.6.4, EML_1.0.3, datapack_1.3.1, dataone_2.1.1
## Run start time: 2018-07-13 12:46:39 CDT
## Run end time: 2018-07-13 12:46:45 CDT
## Error message from this run: NA
## 
## [used]: 1 items used by this run
## -----------------------------------
## Location                                                     Size (kb)    Modified time      
## /Users/eunjungpark/Documents.../fullProv/alicePkg/mtcars.csv 1281         2018-07-12 19:26:52
## 
## [generated]: 2 items generated by this run
## -----------------------------------------
## Location                                                     Size (kb)    Modified time      
## /Users/eunjungpark/Documents...obPkg/new_mtcars_gathered.csv 5401         2018-07-13 12:46:43
## /Users/eunjungpark/Documents...lProv/bobPkg/new_mtcars_g.png 79800        2018-07-13 12:39:44
## Members:
## 
## filename        format    mediaType  size     identifier    modified local 
## new_analysis.R  app...eam NA         376      urn:u...5828a n        y     
## new_mt..._g.png image/png NA         79800    urn:u...bded8 n        y     
## meta.xml        eml...1.1 NA         4274     urn:u...cac85 y        y     
## new_mt...ed.csv text/csv  NA         5401     urn:u...869af n        y     
## 
## Package identifier: NA
## RightsHolder: NA
## 
## 
## Relationships (updated):
## 
##                  subject             predicate                object
## 14 _0b4e40fa...9d2acbefd              rdf:type      prov:Association
## 13 _0b4e40fa...9d2acbefd          prov:hadPlan        new_analysis.R
## 1               meta.xml        cito:documents        new_analysis.R
## 3               meta.xml        cito:documents      new_mtcars_g.png
## 5               meta.xml        cito:documents new_mtcar...hered.csv
## 2         new_analysis.R   cito:isDocumentedBy              meta.xml
## 15        new_analysis.R              rdf:type       provone:Program
## 6  new_mtcar...hered.csv   cito:isDocumentedBy              meta.xml
## 9  new_mtcar...hered.csv              rdf:type          provone:Data
## 21 new_mtcar...hered.csv   prov:wasDerivedFrom resource_...c62d8d439
## 22 new_mtcar...hered.csv   prov:wasDerivedFrom urn:uuid:...52b20f755
## 19 new_mtcar...hered.csv   prov:wasGeneratedBy urn:uuid:...2d470bf65
## 4       new_mtcars_g.png   cito:isDocumentedBy              meta.xml
## 10      new_mtcars_g.png              rdf:type          provone:Data
## 23      new_mtcars_g.png   prov:wasDerivedFrom resource_...c62d8d439
## 24      new_mtcars_g.png   prov:wasDerivedFrom urn:uuid:...52b20f755
## 20      new_mtcars_g.png   prov:wasGeneratedBy urn:uuid:...2d470bf65
## 7  resource_...c62d8d439              rdf:type          provone:Data
## 16 urn:uuid:...2d470bf65    dcterms:identifier urn:uuid:...2d470bf65
## 12 urn:uuid:...2d470bf65              rdf:type     provone:Execution
## 11 urn:uuid:...2d470bf65 prov:qual...sociation _0b4e40fa...9d2acbefd
## 17 urn:uuid:...2d470bf65             prov:used resource_...c62d8d439
## 18 urn:uuid:...2d470bf65             prov:used urn:uuid:...52b20f755
## 8  urn:uuid:...52b20f755              rdf:type          provone:Data

Publish package to DataONE Test node

#d1c <- D1Client("STAGING", "urn:node:mnTestARCTIC")
#pkgId <- uploadDataPackage(d1c, dp, public=TRUE, quiet=FALSE)

Retrieve during transparency

Download package from DataONE
# Fetch the published package (the one carrying the provenance
# relationships) by its metadata identifier, reusing the client `d1c`.
fullProvPkg <- getDataPackage(
  d1c,
  identifier = "urn:uuid:7255080f-86e5-422a-8360-117cf80b6f0b",
  lazyLoad = TRUE,
  limit = "0MB",
  quiet = FALSE
)
## Downloading package members for package with metadata identifier: urn:uuid:7255080f-86e5-422a-8360-117cf80b6f0b
## Downloaded object at URL https://test.arcticdata.io/metacat/d1/mn/v2/object/urn:uuid:7255080f-86e5-422a-8360-117cf80b6f0b
## Lazy Loaded object at URL https://test.arcticdata.io/metacat/d1/mn/v2/object/urn:uuid:ebdd46b9-b2fb-4efd-bb21-c50b96a5fa27
## Lazy Loaded object at URL https://test.arcticdata.io/metacat/d1/mn/v2/object/urn:uuid:51e765db-ef7b-4001-b5af-81e8c263ee12
## Lazy Loaded object at URL https://test.arcticdata.io/metacat/d1/mn/v2/object/urn:uuid:280b3f16-efdb-426b-8ec9-e1f9427c30cf
## Getting resource map with id: resource_map_urn:uuid:7255080f-86e5-422a-8360-117cf80b6f0b
Get provenance relationships
# Remember the current console width and widen the output, presumably so the
# relationship table prints without wrapping.
# NOTE(review): saveWidth is not restored anywhere in the visible part of
# this file - confirm whether options(width = saveWidth) follows later.
saveWidth <- getOption("width")
options(width = 120)

# Read all relationships of the downloaded package and keep only the three
# provenance predicates of interest.
relFullProv <- getRelationships(fullProvPkg, condense = TRUE)
prov <- subset(
  relFullProv,
  predicate %in% c("prov:used", "prov:wasDerivedFrom", "prov:wasGeneratedBy")
)
Replace ids with names
# Goal: show the program's name instead of the execution id in `prov`.
# In the relationship table printed earlier in this file, prov:hadPlan rows
# link an association node (subject) to a program (object), and
# prov:qualifiedAssociation rows link an execution (subject) to an
# association node (object).
provPlan <- subset(relFullProv,predicate == "prov:hadPlan")
provAsso <- subset(relFullProv,predicate == "prov:qualifiedAssociation")
# Replace each association node by the program it points to, so that provAsso
# now maps execution (subject) -> program (object).
provAsso$object <- provPlan$object[match(provAsso$object, provPlan$subject)]
# Substitute the program wherever an execution appears as a subject in prov.
# match(..., nomatch = 0) gives 0 for rows without a match; indexing with 0
# selects nothing, so the right-hand side has exactly one value per matched
# row on the left-hand side.
indx <- match(prov$subject, provAsso$subject, nomatch = 0)
prov$subject[indx != 0] <- provAsso$object[indx]
# Same substitution for the object column.
indx <- match(prov$object, provAsso$subject, nomatch = 0)
prov$object[indx != 0] <- provAsso$object[indx]
# Convert every column to character and rebuild the data frame, then print it.
prov <- data.frame(lapply(prov, as.character), stringsAsFactors=FALSE)
prov
##                   subject           predicate                               object
## 1        new_mtcars_g.png prov:wasDerivedFrom https://cn-stage...ac64-c6bc62d8d439
## 2        new_mtcars_g.png prov:wasDerivedFrom https://cn-stage...9992-15052b20f755
## 3        new_mtcars_g.png prov:wasGeneratedBy                       new_analysis.R
## 4 new_mtcars_gathered.csv prov:wasDerivedFrom https://cn-stage...ac64-c6bc62d8d439
## 5 new_mtcars_gathered.csv prov:wasDerivedFrom https://cn-stage...9992-15052b20f755
## 6 new_mtcars_gathered.csv prov:wasGeneratedBy                       new_analysis.R
## 7          new_analysis.R           prov:used https://cn-stage...ac64-c6bc62d8d439
## 8          new_analysis.R           prov:used https://cn-stage...9992-15052b20f755