Created
November 27, 2022 01:39
-
-
Save bohdanszymanik/847f505db17ea0198f3e37b6bbc53f13 to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
remove.packages("rlang") | |
remove.packages("tibble") | |
rlangUrl <- "https://cran.r-project.org/bin/windows/contrib/4.0/rlang_1.0.2.zip" | |
install.packages(rlangUrl, repos=NULL, type="binary") | |
tcltkUrl <- "https://cran.r-project.org/bin/windows/contrib/4.0/tcltk2_1.2-11.zip" | |
install.packages(tcltkUrl, repos=NULL, type="binary") | |
knitrUrl <- "https://cran.r-project.org/bin/windows/contrib/4.0/knitr_1.38.zip" | |
install.packages(knitrUrl, repos=NULL, type="binary") | |
rglUrl <- "https://cran.r-project.org/bin/windows/contrib/4.0/rgl_0.108.3.zip" | |
install.packages(rglUrl, repos=NULL, type="binary") | |
fastmapUrl <- "https://cran.r-project.org/bin/windows/contrib/4.0/fastmap_1.1.0.zip" | |
install.packages(fastmapUrl, repos=NULL, type="binary") | |
htmltoolsUrl <- "https://cran.r-project.org/bin/windows/contrib/4.0/htmltools_0.5.2.zip" | |
install.packages(htmltoolsUrl, repos=NULL, type="binary") | |
htmlwidgetsUrl <- "https://cran.r-project.org/bin/windows/contrib/4.0/htmlwidgets_1.5.4.zip" | |
install.packages(htmlwidgetsUrl, repos=NULL, type="binary") | |
igraphUrl <- "https://cran.r-project.org/bin/windows/contrib/4.0/igraph_1.3.0.zip" | |
install.packages(igraphUrl, repos=NULL, type="binary") | |
DBIurl <- "https://cran.r-project.org/bin/windows/contrib/4.0/DBI_1.1.2.zip" | |
install.packages(DBIurl, repos=NULL, type="binary") | |
rJavaurl <- "https://cran.r-project.org/bin/windows/contrib/4.0/rJava_1.0-6.zip" | |
install.packages(rJavaurl, repos=NULL, type="binary") | |
rjdbcurl <- "https://cran.r-project.org/bin/windows/contrib/4.0/RJDBC_0.2-10.zip" | |
install.packages(rjdbcurl, repos=NULL, type="binary") | |
tweenr <- "https://cran.r-project.org/bin/windows/contrib/4.0/tweenr_1.0.2.zip" | |
install.packages(tweenr, repos=NULL, type="binary") | |
polyclipurl <- "https://cran.r-project.org/bin/windows/contrib/4.0/polyclip_1.10-0.zip" | |
install.packages(polyclipurl, repos=NULL, type="binary") | |
ggforceurl <- "https://cran.r-project.org/bin/windows/contrib/4.0/ggforce_0.3.3.zip" | |
install.packages(ggforceurl, repos=NULL, type="binary") | |
ggraphurl <- "https://cran.r-project.org/bin/windows/contrib/4.0/ggraph_2.0.5.zip" | |
install.packages(ggraphurl, repos=NULL, type="binary") | |
ggrepelurl <- "https://cran.r-project.org/bin/windows/contrib/4.0/ggrepel_0.9.1.zip" | |
install.packages(ggrepelurl, repos=NULL, type="binary") | |
graphlayoutsurl <- "https://cran.r-project.org/bin/windows/contrib/4.0/graphlayouts_0.8.0.zip" | |
install.packages(graphlayoutsurl, repos=NULL, type="binary") | |
pillarurl <- "https://cran.r-project.org/bin/windows/contrib/4.0/pillar_1.7.0.zip" | |
install.packages(pillarurl, repos=NULL, type="binary") | |
tidyselecturl <- "https://cran.r-project.org/bin/windows/contrib/4.0/tidyselect_1.1.2.zip" | |
install.packages(tidyselecturl, repos=NULL, type="binary") | |
ggplot2url <- "https://cran.r-project.org/bin/windows/contrib/4.0/ggplot2_3.3.5.zip" | |
install.packages(ggplot2url, repos=NULL, type="binary") | |
ggfunurl <- "https://cran.r-project.org/bin/windows/contrib/4.0/ggfun_0.0.6.zip" | |
install.packages(ggfunurl, repos=NULL, type="binary") | |
scalesurl <- "https://cran.r-project.org/bin/windows/contrib/4.0/scales_1.2.0.zip" | |
install.packages(scalesurl, repos=NULL, type="binary") | |
ggimageurl <- "https://cran.r-project.org/bin/windows/contrib/4.0/ggimage_0.3.0.zip" | |
install.packages(ggimageurl, repos=NULL, type="binary") | |
tibbleurl <- "https://cran.r-project.org/bin/windows/contrib/4.0/tibble_3.1.6.zip" | |
install.packages(tibbleurl, repos=NULL, type="binary") | |
getpassurl <- "https://cran.r-project.org/bin/windows/contrib/4.0/getPass_0.2-2.zip" | |
install.packages(getpassurl, repos=NULL, type="binary") | |
dplyrurl <- "https://cran.r-project.org/bin/windows/contrib/4.0/dplyr_1.0.8.zip" | |
install.packages(dplyrurl, repos=NULL, type="binary") | |
tibbleurl <- "https://cran.r-project.org/bin/windows/contrib/4.0/tibble_3.1.6.zip" | |
install.packages(tibbleurl, repos=NULL, type="binary") | |
install.packages("tidygraph") | |
install.packages("tidyr") | |
install.packages("tictoc") | |
library(rlang) | |
library(tcltk) | |
library(igraph) | |
library(htmlwidgets) | |
library(htmltools) | |
library(fastmap) | |
library(rgl) | |
library(knitr) | |
library(DBI) | |
library(rJava) | |
library(RJDBC) | |
library(tibble) | |
library(getPass) | |
library(dplyr) | |
library(tidyr) | |
library(purrr) | |
library(pillar) | |
library(tidygraph) | |
library(graphlayouts) | |
library(ggrepel) | |
library(tweenr) | |
library(polyclip) | |
library(ggforce) | |
library(ggraph) | |
library(ggplot2) | |
library(tictoc) | |
library(ggimage) | |
# useful references | |
# https://jeremydfoote.com/Communication-and-Social-Networks/resources/ggraph_walkthrough.html | |
# https://www.r-bloggers.com/2019/03/graph-analysis-using-the-tidyverse/ | |
# https://www.jessesadler.com/post/network-analysis-with-r/ | |
# OK, so far so good, now let's connect to DB and start querying | |
# tuples | |
jcc = JDBC("com.ibm.db2.jcc.DB2Driver", | |
"C:/../java/db2jcc4.jar") | |
# this is not the best way to get the password - displays it as a value in RStudio | |
# pwd <- getPass::getPass() | |
# instead just put a pwd dialog box into the connection string dbConnect function parameters | |
# more info in https://solutions.rstudio.com/db/best-practices/managing-credentials/ | |
#Connection String - either one works | |
# conn = dbConnect(jcc,"jdbc:db2://10.113.91.195:50000/DBNIAR02",<uid here>,getPass::getPass()) | |
conn = dbConnect(jcc,"jdbc:db2://some db2 database host:50000/some database",rstudioapi::askForPassword("Enter username"),rstudioapi::askForPassword("Enter password")) | |
# I can do two years worth of links with a graph made up from numeric node ids, but with character ids then I | |
# need to drop back to less eg maybe 1 year because performance drops | |
q <- " select | |
sourceid as Source | |
,targetid as Target | |
,sourceType | |
,targetType | |
,linkType | |
from someschema.someLinkTable | |
where months_between(current date, creationDate) < 6 | |
" | |
# depending upon implementation this might give us a single link | |
# or 2 records for any source/target pair, one for the | |
# source - target, and another reciprocal but otherwise identical link with source | |
# and target reversed | |
rs <- dbSendQuery(conn, q) | |
dft <- tibble::as_tibble(fetch(rs, -1)) | |
# dft is effectively an edge list giving source target | |
# + attributes of source and target | |
# + attributes of the edge | |
# to get tidygraph to load attributes for nodes we need to generate a node list | |
nodes <- | |
dplyr::bind_rows( | |
dft %>% select(id=SOURCE, entityType=SOURCETYPE), | |
dft %>% select(id=TARGET, entityType=TARGETTYPE)) %>% | |
distinct() | |
edges <- | |
dft %>% | |
mutate(linkType=stringr::str_trim(LINKTYPE)) %>% | |
select(from=SOURCE, to=TARGET, linkType) | |
nodes | |
edges | |
nodes %>% distinct(entityType) | |
# # A tibble: with some nodes of some entity or another | |
nodes %>% | |
dplyr::group_by(entityType) %>% | |
dplyr::summarise(n=n()) | |
# same same as | |
nodes %>% | |
dplyr::count(entityType) | |
edges %>% | |
dplyr::group_by(linkType) %>% | |
dplyr::summarise(n=n()) | |
# or | |
edges %>% | |
dplyr::count(linkType) | |
# # A tibble: of edges of differing link types | |
# strange... this doesn't work but... | |
graph <- tbl_graph(nodes=nodes, | |
edges=edges, | |
directed=F) | |
# this does work - I wonder if connected to this issues somehow https://github.com/thomasp85/tidygraph/issues/89 | |
# needs to be undirected for the shortest_paths function to work | |
graph <- igraph::graph_from_data_frame(edges, vertices = nodes, directed=F) %>% as_tbl_graph() | |
graph | |
# Actually, I think it's more about this issue https://stackoverflow.com/questions/50457926/tidygraph-and-igraph-build-graph-from-dataframe-discrepancy | |
nodesChar <- | |
dplyr::bind_rows( | |
dft %>% select(id=SOURCE, entityType=SOURCETYPE), | |
dft %>% select(id=TARGET, entityType=TARGETTYPE)) %>% | |
distinct() %>% | |
mutate(id = as.character(id)) | |
nodesChar | |
# # A tibble of nodes using chr to identify the ids | |
edgesChar <- | |
dft %>% | |
mutate(linkType=stringr::str_trim(LINKTYPE)) %>% | |
select(from=SOURCE, to=TARGET, linkType) %>% | |
mutate_at(vars(from, to), as.character) | |
edgesChar | |
# # A tibble of edges and edge types with chrs as the id types | |
graphChar <- tbl_graph(nodes=nodesChar, | |
edges=edgesChar, | |
directed=F) | |
# yes, that fixed it - now tbl_graph works | |
# let's now join up with some other known ids | |
pois <- tribble( | |
~pid, ~pidChar, | |
1, "1", | |
2,"2" | |
3,"3" | |
) | |
# lets check to see if we can find these people | |
# below works for numeric ids in nodes | |
pois %>% dplyr::inner_join(nodes, by = c("pid" = "id")) | |
# but we now have character node ids | |
pois %>% dplyr::inner_join(nodesChar, by = c("pidChar" = "id")) | |
# all two element combinations of POIs | |
combs <- combn(pois$pid,2) | |
combsChar <- combn(pois$pidChar,2) | |
# quick check on one of the nodes we know exists that we can find it | |
V(graph)[name==5] | |
V(graphChar)[id=="8"] | |
myplots <- vector('list', ncol(combs)) | |
sps <- | |
for (col in 1:ncol(combs)) { | |
print(paste(as.character(combs[1,col]), as.character(combs[2,col]))) | |
path <- igraph::shortest_paths(graph, | |
from = V(graph)[id==combs[1,col]], | |
to = V(graph)[id==combs[2,col]]) | |
if (length(path$vpath[[1]]) > 1) { | |
print(paste("Found a path, plotting a graph over", length(path$vpath[[1]]), "nodes")) | |
myplots[[col]] <- igraph::induced_subgraph(graph, vids = unlist(path$vpath[1])) %>% | |
tidygraph::as_tbl_graph() %>% | |
ggraph(layout = "nicely") + | |
geom_edge_link() + | |
# geom_node_point(size = 10, fill = "white", shape = 21) + | |
geom_image(aes(x = x, y = y, image= | |
case_when( | |
entityType == 'Person' ~ 'C:\\..\\Person.png', | |
entityType == 'Organisation' ~ 'C:\\..\\Organization.png', | |
entityType == 'Location' ~ 'C:\\..\\Place.png', | |
) | |
), size = 0.05) + | |
geom_label(aes(x = x, y = y, label = id, colour=factor(entityType)), nudge_y = 0.2, nudge_x = 0.2, label.size = NA) + | |
# geom_node_text(aes(label = name), repel = TRUE) + | |
theme_graph() | |
} | |
} | |
Filter(Negate(is.null),myplots) | |
# why am I often only getting paths involving people and not other entity types? | |
# figuring out how these functions to find shortest paths work | |
t <- igraph::shortest_paths(graph, "2", "6") | |
length(t$vpath[[1]]) | |
for (n in t$vpath[1]) { | |
print(paste("n", n)) | |
} | |
allT <- igraph::all_shortest_paths(graph, "3", "16") | |
length(allT$res) | |
length(allT$res[[1]]) | |
length(allT$res[1]) | |
length(allT$res[2]) | |
length(allT$res[3]) | |
length(allT$res[[2]]) | |
length(allT$res[[3]]) | |
unlist(allT$res) | |
for (n in allT$vpath[1]) { | |
print(paste("n", n)) | |
} | |
allT1 <- igraph::all_shortest_paths(graph, "17", "15") | |
# here's the question - how would I construct a new graph made up of the resulting paths? | |
graph | |
# graph %>% | |
# activate(nodes) %>% | |
# filter(selected_node==80282) | |
sub_graph <- graph %>% | |
morph(to_subgraph, id %in% t$vpath[1]) | |
sub_graph <- to_subgraph(graph, id %in% t$vpath[1], subset_by = "nodes")$subgraph | |
sub_graph <- to_subgraph(graph, id %in% c(80282), subset_by = "nodes")$subgraph | |
# well this works! | |
igraph::induced_subgraph(graph, vids = unlist(t$vpath[1])) %>% | |
tidygraph::as_tbl_graph() %>% | |
ggraph(layout = "nicely") + | |
geom_edge_link() + | |
geom_node_point(size = 10, fill = "white", shape = 21) + | |
geom_label(aes(x = x, y = y, label = name, colour=factor(entityType)), nudge_y = 0.2, nudge_x = 0.2, label.size = NA) + | |
# geom_node_text(aes(label = name), repel = TRUE) + | |
theme_graph() | |
ggraph(sub_graph) + | |
geom_edge_link() + | |
geom_node_point() + | |
theme_graph() | |
# to plot with icons how about the following icons? | |
# https://ionic.io/ionicons | |
# geom_icon() | |
# body-outline -> a person | |
# map-outline -> a map | |
# people-outline -> org | |
# lots of emoji/dingbat icons we could also use https://apps.timwhitlock.info/emoji/tables/unicode | |
# geom_emoji() | |
# ggimage::geom_image() enables use of an arbitrary image file so we could use icons we have on disk | |
# https://stackoverflow.com/questions/16300344/how-to-flatten-a-list-of-lists | |
# in following c is applied to the list items to create a vector, do I need a vector here or just the list? | |
# do.call(c, unlist(foolist, recursive=FALSE)) | |
# this time we use all_shortest_paths and plot them all | |
# drats ... this doesn't work with nodes/edges having a large dataset size and character ids - runs out of memory | |
myplots <- vector('list', ncol(combs)) | |
spsAll <- | |
for (col in 1:ncol(combs)) { | |
print(paste(as.character(combs[1,col]), as.character(combs[2,col]))) | |
paths <- igraph::all_shortest_paths(graph, | |
from = V(graph)[name==combs[1,col]], | |
to = V(graph)[name==combs[2,col]]) | |
if (length(paths$res) > 1) { | |
print(paste("Found a path, plotting a graph over", length(unlist(path$res)), "nodes")) | |
myplots[[col]] <- igraph::induced_subgraph(graph, vids = unlist(paths$res)) %>% | |
tidygraph::as_tbl_graph() %>% | |
ggraph(layout = "nicely") + | |
geom_edge_link() + | |
# geom_node_point(size = 10, fill = "white", shape = 21) + | |
geom_image(aes(x = x, y = y, image= | |
case_when( | |
entityType == 'Person' ~ 'C:\\..\\Person.png', | |
entityType == 'Organisation' ~ 'C:\\..\\Organization.png', | |
entityType == 'Location' ~ 'C:\\..\\Place.png', | |
) | |
), size = 0.05) + | |
geom_label(aes(x = x, y = y, label = name, colour=factor(entityType)), nudge_y = 0.2, nudge_x = 0.2, label.size = NA) + | |
# geom_node_text(aes(label = name), repel = TRUE) + | |
theme_graph() | |
} | |
} | |
Filter(Negate(is.null),myplots) | |
# some of the nodes are massively connected - I suspect locations like major cities etc | |
# I think I can usefully simplify the graph if I get rid of them | |
# in the hope this addresses some of my memory errors | |
vertexEdges <- degree(graph) | |
sort(vertexEdges, decreasing=T) | |
sort(vertexEdges[vertexEdges<100], decreasing=T) | |
vertexEdges[1] | |
# turns out that there aren't so many highly connected nodes | |
# and all vertices have at least one edge | |
length(vertexEdges) | |
length(vertexEdges[vertexEdges<10]) | |
degree_distribution(graph) | |
vertexEdges[vertexEdges==9] | |
length(V(graph)) #2421707 | |
length(V(graph)[degree(graph)<10]) #2313383 | |
neighbors(graph, "73") | |
neighbors(graph, "6") # returns x neighbour vertices which is what we expect | |
neighbors(graph, "306") # returns y neighbour vertices which is what we expect | |
neighbors(graph, "973") # returns z neighbours which is what we expect | |
# this works but saves only roughly 1/3rd memory | |
graphSmaller <- igraph::induced_subgraph(graph, vids=V(graph)[degree(graph)<10], impl="auto") %>% tidygraph::as_tbl_graph() | |
# little confused at how these graphs are represented in the RStudio Data viewer | |
# when small it's just 'List of some number of nodes | |
# when large it's 'Large tbl_graph (some number of elements, some MB) | |
t1 <- igraph::make_ring(100) %>% tidygraph::as_tbl_graph() # List of 100 | |
t2 <- igraph::make_ring(100000) %>% tidygraph::as_tbl_graph() # Large tbl_graph (100000 elements, 4.8 MB) | |
t3 <- igraph::make_ring(100000) # Large igraph (100000 elements, 4.8 MB) | |
# reclaim our resources | |
rm("t1") | |
rm("t2") | |
rm("t3") | |
gc() | |
# so let's try with our smaller graph - drats <10 connections makes for no discovered paths | |
# let's increase | |
graphSmaller <- igraph::induced_subgraph(graph, vids=V(graph)[degree(graph)<20], impl="auto") %>% tidygraph::as_tbl_graph() | |
myplots <- vector('list', ncol(combs)) | |
spsAll <- | |
for (col in 1:ncol(combs)) { | |
print(paste(as.character(combs[1,col]), as.character(combs[2,col]))) | |
paths <- igraph::all_shortest_paths(graphSmaller, | |
from = V(graphSmaller)[name==combs[1,col]], | |
to = V(graphSmaller)[name==combs[2,col]]) | |
if (length(paths$res) > 1) { | |
print(paste("Found a path, plotting a graph over", length(unlist(path$res)), "nodes")) | |
myplots[[col]] <- igraph::induced_subgraph(graphSmaller, vids = unlist(paths$res)) %>% | |
tidygraph::as_tbl_graph() %>% | |
ggraph(layout = "nicely") + | |
geom_edge_link() + | |
# geom_node_point(size = 10, fill = "white", shape = 21) + | |
geom_image(aes(x = x, y = y, image= | |
case_when( | |
entityType == 'Person' ~ 'C:\\..\\Person.png', | |
entityType == 'Organisation' ~ 'C:\\..\\Organization.png', | |
entityType == 'Location' ~ 'C:\\..\\Place.png', | |
) | |
), size = 0.05) + | |
geom_label(aes(x = x, y = y, label = name, colour=factor(entityType)), nudge_y = 0.2, nudge_x = 0.2, label.size = NA) + | |
# geom_node_text(aes(label = name), repel = TRUE) + | |
theme_graph() | |
} | |
} |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment