LahmanData {Lahman}R Documentation

Lahman Datasets

Description

This dataset gives a concise description of the data files in the Lahman package. It may be useful for computing on the various files.

Usage

data(LahmanData)

Format

A data frame with 24 observations on the following 5 variables.

file

name of dataset

class

class of dataset

nobs

number of observations

nvar

number of variables

title

dataset title

Details

This dataset is generated using vcdExtra::datasets(package="Lahman") with some post-processing.

Examples


data(LahmanData)

# For every dataset listed in LahmanData, collect the column names ending
# in "ID". get() looks each dataset up by its name, so the datasets
# themselves must be visible from here (presumably via an attached Lahman
# package -- confirm).
IDvars <- setNames(
    lapply(LahmanData[,"file"], function(tbl) {
        grep('.*ID$', colnames(get(tbl)), value=TRUE)
    }),
    LahmanData[,"file"]
)
str(IDvars)
## List of 24
##  $ AllstarFull        : chr [1:5] "playerID" "yearID" "gameID" "teamID" ...
##  $ Appearances        : chr [1:4] "yearID" "teamID" "lgID" "playerID"
##  $ AwardsManagers     : chr [1:4] "managerID" "awardID" "yearID" "lgID"
##  $ AwardsPlayers      : chr [1:4] "playerID" "awardID" "yearID" "lgID"
##  $ AwardsShareManagers: chr [1:4] "awardID" "yearID" "lgID" "managerID"
##  $ AwardsSharePlayers : chr [1:4] "awardID" "yearID" "lgID" "playerID"
##  $ Batting            : chr [1:4] "playerID" "yearID" "teamID" "lgID"
##  $ BattingPost        : chr [1:4] "yearID" "playerID" "teamID" "lgID"
##  $ Fielding           : chr [1:4] "playerID" "yearID" "teamID" "lgID"
##  $ FieldingOF         : chr [1:2] "playerID" "yearID"
##  $ FieldingPost       : chr [1:4] "playerID" "yearID" "teamID" "lgID"
##  $ HallOfFame         : chr [1:2] "hofID" "yearID"
##  $ Managers           : chr [1:4] "managerID" "yearID" "teamID" "lgID"
##  $ ManagersHalf       : chr [1:4] "managerID" "yearID" "teamID" "lgID"
##  $ Master             : chr [1:9] "lahmanID" "playerID" "managerID" "hofID" ...
##  $ Pitching           : chr [1:4] "playerID" "yearID" "teamID" "lgID"
##  $ PitchingPost       : chr [1:4] "playerID" "yearID" "teamID" "lgID"
##  $ Salaries           : chr [1:4] "yearID" "teamID" "lgID" "playerID"
##  $ Schools            : chr "schoolID"
##  $ SchoolsPlayers     : chr [1:2] "playerID" "schoolID"
##  $ SeriesPost         : chr "yearID"
##  $ Teams              : chr [1:5] "yearID" "lgID" "teamID" "franchID" ...
##  $ TeamsFranchises    : chr "franchID"
##  $ TeamsHalf          : chr [1:4] "yearID" "lgID" "teamID" "divID"
# distinct ID variable names across all datasets, in order of first appearance
unique(unlist(IDvars, use.names=FALSE))
##  [1] "playerID"   "yearID"     "gameID"     "teamID"     "lgID"      
##  [6] "managerID"  "awardID"    "hofID"      "lahmanID"   "lahman40ID"
## [11] "lahman45ID" "retroID"    "holtzID"    "bbrefID"    "schoolID"  
## [16] "franchID"   "divID"

# which datasets have playerID?
# vapply() always returns a logical vector (sapply() returns a list for empty
# input, which which() rejects), so this also works when IDvars is empty.
names(which(vapply(IDvars, function(x) "playerID" %in% x, logical(1))))
##  [1] "AllstarFull"        "Appearances"        "AwardsPlayers"     
##  [4] "AwardsSharePlayers" "Batting"            "BattingPost"       
##  [7] "Fielding"           "FieldingOF"         "FieldingPost"      
## [10] "Master"             "Pitching"           "PitchingPost"      
## [13] "Salaries"           "SchoolsPlayers"

################################################
# Visualize relations among datasets via an MDS
################################################
# Jaccard distance between two sets A and B: 1 - |intersection| / |union|.
# The result is floored at 1e-05 to ensure it is strictly positive, even for
# identical sets. Two empty sets are treated like identical sets and get the
# floor value too (the plain formula would yield 0/0 = NaN).
jaccard <- function(A, B) {
    n_union <- length(union(A, B))
    if (n_union == 0) return(.00001)
    max(1 - length(intersect(A, B)) / n_union, .00001)
}

# Pairwise distance matrix between the elements of a list.
#   vars: list of sets (e.g. character vectors), one element per item
#   FUN:  function(A, B) returning the distance between two elements
# Returns a length(vars) x length(vars) matrix with a zero diagonal, whose
# rows and columns are labelled with names(vars).
distmat <- function(vars, FUN=jaccard) {
    nv <- length(vars)
    d <- matrix(0, nv, nv, dimnames=list(names(vars), names(vars)))
    # seq_len() instead of 1:nv: an empty list gives a 0 x 0 matrix rather
    # than a loop over c(1, 0) that fails on vars[[1]]
    for (i in seq_len(nv)) {
        # both (i, j) and (j, i) are evaluated; FUN is not assumed symmetric
        for (j in seq_len(nv)[-i]) {
            d[i,j] <- FUN(vars[[i]], vars[[j]])
        }
    }
    d
}

# classical MDS (cmdscale) on the pairwise distances between ID-variable sets
distID <- distmat(IDvars)
config <- cmdscale(distID)

# cycle the label position through 1..4 so that consecutive labels are
# placed on different sides of their points
pos <- rep(1:4, length.out = nrow(config))
plot(
    config[, 1], config[, 2],
    xlab = "", ylab = "", asp = 1, axes = FALSE,
    main = "MDS of ID variable distances of Lahman tables"
)
abline(h = 0, v = 0, col = "gray80")
text(
    config[, 1], config[, 2],
    labels = rownames(config), cex = 0.75, pos = pos, xpd = NA
)

plot of chunk unnamed-chunk-1



[Package Lahman version 2.0-1 Index]