Using dplyr:filter to create new variables in for loop for rMarkdown - for-loop

I have some R code that is meant to run a .rmd file so that I can create a unique report for multiple individuals at the same time.
Simplistically, my issue is that code chunk 1 creates a PDF output that is correct, however, will only create one PDF, and not one for each athlete in my dataset.
Code chunk 2 will create multiple PDF outputs (one for each athlete in the dataset), however, the variables created within the for loop (i.e., Athlete_full_name_i, Athlete_Sport_i) do not work. In the PDF output, these values are presented as "c("values","in","the","list")".
I think the code is close to working well, but I would love some advice to make this run more smoothly.
Thank you!
code chunk 1:
'''
#libraries
library(tidyverse)
library(rmarkdown)
library(dplyr)
#extra information
Folder_path1 <- "path to folder"
Excel_sheet1 <- "Data"
StartDate1 <- "2021-09-01"
EndDate1 <- Sys.Date()
#create athlete dataframe to run one, or multiple athlete reports at once.
Athlete_full_name <- c("John Smith", "Joe Tree") #as the name should appear in the report
Athlete_file_name <- c("SmithJ", "TreeJ") #Last name, first initial
Athlete_Team <- c("Men's Football", "Men's Darts") #Sport as it should appear in the report
Athlete_df <- data.frame(Athlete_full_name, Athlete_file_name, Athlete_Team)
Names <- unique(Athlete_df$Athlete_file_name)
#For loop - each athlete in Athlete_df gets unique report
for (i in 1:length(Names)) {
#creating new variables for each iteration
#determine athlete full name
Athlete_full_name_i <- Athlete_df %>%
filter(Athlete_file_name == Names[i]) %>%
subset(select = 1)
#determine athlete sport
Athlete_Sport_i <- Athlete_df %>%
filter(Athlete_file_name == Names[i]) %>%
subset(select = 3)
#create Athlete report with unique params and unique filename
rmarkdown::render("R2S_Dinos_JumpV3.Rmd",
params = list(Athlete_csvname = Names[i],
Athlete_fullname = Athlete_full_name_i,
Team = Athlete_Sport_i,
Folder_path = Folder_path1,
Excel_sheet = Excel_sheet1,
StartDate = StartDate1,
EndDate = EndDate1),
output_format = "pdf_document",
output_file=paste0("Athlete Reports/", Names[i], "_Report_", Sys.Date(), ".pdf"))
}
'''
code chunk 2:
'''
'''
# Renders one PDF report per athlete by looping over the file names in
# Athlete_df. NOTE(review): as written, the two filter() calls below keep
# every row on every iteration (see the notes next to them).
#libraries
library(tidyverse)
library(rmarkdown)
library(dplyr)
#extra information
Folder_path1 <- "path to folder"
Excel_sheet1 <- "Data"
StartDate1 <- "2021-09-01"
EndDate1 <- Sys.Date()
#create athlete dataframe to run one, or multiple athlete reports at once.
Athlete_full_name <- c("John Smith", "Joe Tree") #as the name should appear in the report
Athlete_file_name <- c("SmithJ", "TreeJ") #Last name, first initial
Athlete_Team <- c("Men's Football", "Men's Darts") #Sport as it should appear in the report
Athlete_df <- data.frame(Athlete_full_name, Athlete_file_name, Athlete_Team)
#For loop - each athlete in Athlete_df gets unique report
# The loop variable has the same name as the Athlete_file_name column.
for (Athlete_file_name in Athlete_df$Athlete_file_name) {
#creating new variables for each iteration
#determine athlete full name
Athlete_full_name_i <- Athlete_df %>%
# NOTE(review): inside filter() both sides of == refer to the column, so the
# column is compared with itself and no row is ever dropped.
filter(Athlete_file_name == Athlete_file_name) %>%
# subset(select = 1) keeps the first column as a one-column data frame.
subset(select = 1)
#determine athlete sport
Athlete_Sport_i <- Athlete_df %>%
# NOTE(review): same self-comparison as above - every row is kept.
filter(Athlete_file_name == Athlete_file_name) %>%
# subset(select = 3) keeps the third column as a one-column data frame.
subset(select = 3)
#create Athlete report with unique params and unique filename
rmarkdown::render("R2S_Dinos_JumpV3.Rmd",
params = list(Athlete_csvname = Athlete_file_name,
Athlete_fullname = Athlete_full_name_i,
Team = Athlete_Sport_i,
Folder_path = Folder_path1,
Excel_sheet = Excel_sheet1,
StartDate = StartDate1,
EndDate = EndDate1),
output_format = "pdf_document",
# Outside dplyr verbs the name refers to the loop variable, so the output
# file name does change on each iteration.
output_file=paste0("Athlete Reports/", Athlete_file_name, "_Report_", Sys.Date(), ".pdf"))
}
'''

When I run Code chunk 1 with a test Rmd output file it works as expected (two pdf files):
test_out.Rmd file:
---
title: "test_out"
params:
name: Bob
personid: 1
---
`r params$name` is person number `r params$personid`
Shortened slightly for clarity:
library(tidyverse)
library(rmarkdown)
Athlete_full_name <- c("John Smith", "Joe Tree")
Athlete_file_name <- c("SmithJ", "TreeJ")
Athlete_Team <- c("Men's Football", "Men's Darts")
Athlete_df <- data.frame(Athlete_full_name, Athlete_file_name, Athlete_Team)
Names <- unique(Athlete_df$Athlete_file_name)
for (i in 1:length(Names)) {
Athlete_full_name_i <- Athlete_df %>%
filter(Athlete_file_name == Names[i]) %>%
subset(select = 1)
Athlete_Sport_i <- Athlete_df %>%
filter(Athlete_file_name == Names[i]) %>%
subset(select = 3)
rmarkdown::render(
"test_out.Rmd",
params = list(Athlete_fullname = Athlete_full_name_i,
Team = Athlete_Sport_i),
output_format = "pdf_document",
output_file = paste0("test_out/", Names[i], "_Report_", Sys.Date(), ".pdf")
)
}
You could make this tidier and easier to debug with an anonymous function inside group_walk - this is the way I'd recommend doing it:
Athlete_df |>
rowwise() |>
group_walk(function(data, key) {
rmarkdown::render(
"test_out.Rmd",
params = list(
Athlete_fullname = data$Athlete_full_name,
Team = data$Athlete_Team
),
output_format = "pdf_document",
output_file = paste0("test_out/", data$Athlete_file_name, "_Report", ".pdf")
)
})
(same result)
In your code chunk 2 the problem comes when you filter with Athlete_file_name == Athlete_file_name. This tests whether these variables are equal within the dataframe, i.e. keep every row where the Athlete_file_name column equals the Athlete_file_name column, which means every row every time! Change the name of the iterating variable to e.g. in_name:
for (in_name in Athlete_df$Athlete_file_name) {
...
filter(Athlete_file_name == in_name) %>% # twice
...
output_file = paste0("test_out/", in_name,
"_Report_", Sys.Date(), ".pdf"
...
}

Thanks for the feedback everyone, there was an error in the for loop. The solution is as follows:
'''
# Render one PDF report per athlete from a single parameterised Rmd file.
#libraries
library(tidyverse)
library(rmarkdown)
library(dplyr)

# Settings shared by every report.
Folder_path1 <- "path to folder"
Excel_sheet1 <- "Data"
StartDate1 <- "2021-09-01"
EndDate1 <- Sys.Date()
Output_folder1 <- "Athlete Reports"

# One row per athlete; add entries here to run several reports at once.
Athlete_full_name <- c("John Smith", "Joe Tree") # as the name should appear in the report
Athlete_file_name <- c("SmithJ", "TreeJ") # Last name, first initial
Athlete_Team <- c("Men's Football", "Men's Darts") # Sport as it should appear in the report
Athlete_df <- data.frame(Athlete_full_name, Athlete_file_name, Athlete_Team)
Names <- unique(Athlete_df$Athlete_file_name)

# Make sure the output folder exists before the first report is written.
dir.create(Output_folder1, showWarnings = FALSE, recursive = TRUE)

# For loop - each athlete in Athlete_df gets a unique report.
for (i in Names) {
  # Look the athlete up once. slice(1) guards against a file name that was
  # entered twice, which would otherwise hand several values to the report.
  Athlete_row <- Athlete_df %>%
    filter(Athlete_file_name == i) %>%
    slice(1)

  # Pass plain character strings to the report. subset(select = ...) would
  # pass one-column data frames, which do not print as simple text.
  Athlete_full_name_i <- as.character(Athlete_row$Athlete_full_name)
  Athlete_Sport_i <- as.character(Athlete_row$Athlete_Team)

  # Create the athlete report with unique params and a unique file name.
  rmarkdown::render(
    "R2S_Dinos_JumpV3.Rmd",
    params = list(
      Athlete_csvname = i,
      Athlete_fullname = Athlete_full_name_i,
      Team = Athlete_Sport_i,
      Folder_path = Folder_path1,
      Excel_sheet = Excel_sheet1,
      StartDate = StartDate1,
      EndDate = EndDate1
    ),
    output_format = "pdf_document",
    output_file = file.path(
      Output_folder1,
      paste0(i, "_Report_", Sys.Date(), ".pdf")
    )
  )
}

Related

Downloading multiple PDFs using Rselenium

I'm trying to download multiple PDFs by navigating on a page. Even though I'm able to navigate on the page using drop down and download PDF at the end. I am getting this error:
An element command failed because the referenced element is no longer attached to the DOM.
Below is my code:
# Walk the state / district / block drop-downs on secc.gov.in and click every
# PDF link in the resulting table.
library(RSelenium)
library(stringr)

rd <- rsDriver()
remDr <- rd[["client"]]
remDr$navigate("http://secc.gov.in/lgdStateList")

# Return the option labels of a drop-down element, without the first entry
# (the first entry is dropped exactly as in the original script).
option_labels <- function(element) {
  labels <- element$getElementText()[[1]] %>%
    strsplit(., "\\n") %>%
    unlist() %>%
    str_trim("left")
  labels[-1]
}

# Find a drop-down by id, type `label` into it and click it again.
# The element is looked up on every call instead of being reused.
# NOTE(review): reusing an element after the page has redrawn is the usual
# cause of "no longer attached to the DOM"; confirm against the live site.
choose_option <- function(id, label) {
  element <- remDr$findElement("id", id)
  element$clickElement()
  element$sendKeysToElement(list(label))
  element$clickElement()
  invisible(element)
}

# First drop down
states <- option_labels(remDr$findElement("id", "lgdState"))
for (i in seq_along(states)) {
  choose_option("lgdState", states[i])

  # Second drop down
  districts <- option_labels(remDr$findElement("id", "lgdDistrict"))
  for (j in seq_along(districts)) {
    choose_option("lgdDistrict", districts[[j]])

    # Third drop down
    block <- option_labels(remDr$findElement("id", "lgdBlock"))
    for (k in seq_along(block)) {
      choose_option("lgdBlock", block[[k]])

      gpEle <- remDr$findElements("class", "statesrow")
      for (m in seq_along(gpEle)) {
        # Split the row's HTML on <td>; the text before the first <td> is
        # dropped, leaving one piece per table cell.
        h <- unlist(gpEle[[m]]$getElementAttribute("innerHTML"))
        h <- unlist(h %>% strsplit(., "<td>"))
        h <- h[-1]
        for (n in seq_along(h)) {
          # XPath attribute tests are written with @ (not #).
          xpath1 <- paste0(
            '//*[@id="example"]/tbody/tr[', m, "]/td[", n, "]/a"
          )
          pdfEle <- remDr$findElement("xpath", xpath1)
          pdfEle$clickElement()
          Sys.sleep(5)
        }
      }
    }
  }
}
As per your request
# Download every Gram Panchayat PDF from secc.gov.in by replaying the form
# posts behind the state / district / block drop-downs (no browser needed).
library(rvest)
library(stringr)

url <- "http://secc.gov.in/lgdStateList"
page <- html_session(url)

# Text the script looks for in a ".error" node of the download response;
# when it is found, no file is written.
error <- "PDF File for this Gram Panchayat is not available."

# Pull the fourth comma-separated argument out of an onclick handler such as
# "fn(a, b, c, 1234)" and return its digits.
extract_gp_code <- function(onclick) {
  args <- str_extract_all(onclick, "\\([^()]+\\)")[[1]]
  args <- substring(args, 2, nchar(args) - 1)
  str_extract(strsplit(args, ",")[[1]][4], "[[:digit:]]+")
}

## STATE LOOP ##
state <- html_nodes(page, css = "#lgdState > option") %>% html_text()
state <- state[-1]
state_id <- html_nodes(page, css = "#lgdState > option") %>% html_attr("value")
state_id <- state_id[-1]

for (i in seq_along(state)) {
  page1 <- rvest:::request_POST(
    page,
    url = "http://secc.gov.in/lgdDistrictList",
    body = list("stateCode" = state_id[i]),
    encode = "form"
  )

  ## DISTRICT LOOP ##
  district <- html_nodes(page1, css = "#lgdDistrict > option") %>% html_text()
  district <- district[-1]
  district_id <- html_nodes(page1, css = "#lgdDistrict > option") %>%
    html_attr("value")
  district_id <- district_id[-1]

  for (j in seq_along(district)) {
    page2 <- rvest:::request_POST(
      page1,
      url = "http://secc.gov.in/lgdBlockList",
      body = list(
        "stateCode" = state_id[i],
        "districtCode" = district_id[j]
      ),
      encode = "form"
    )

    ## BLOCK LOOP ##
    block <- html_nodes(page2, css = "#lgdBlock > option") %>% html_text()
    block <- block[-1]
    block_id <- html_nodes(page2, css = "#lgdBlock > option") %>%
      html_attr("value")
    block_id <- block_id[-1]

    for (k in seq_along(block)) {
      page3 <- rvest:::request_POST(
        page2,
        url = "http://secc.gov.in/lgdGpList",
        body = list(
          "stateCode" = state_id[i],
          "districtCode" = district_id[j],
          "blockCode" = block_id[k]
        ),
        encode = "form"
      )

      # Links without an onclick handler carry no code; skip them.
      txt <- html_nodes(page3, css = "#example a") %>% html_attr("onclick")
      txt <- txt[!is.na(txt)]
      gpcode <- vapply(txt, extract_gp_code, character(1), USE.NAMES = FALSE)

      ## GP CODE LOOP to download file ##
      for (l in seq_along(gpcode)) {
        page4 <- rvest:::request_POST(
          page3,
          url = "http://secc.gov.in/downloadLgdwisePdfFile",
          body = list(
            "stateCode" = state_id[i],
            "districtCode" = district_id[j],
            "blockCode" = block_id[k],
            "gpCode" = gpcode[l]
          ),
          encode = "form"
        )

        # If the response cannot be read as HTML, treat it as having no
        # error message.
        error_displayed <- tryCatch(
          html_nodes(page4, css = ".error") %>% html_text(),
          error = function(e) character(0)
        )

        # %in% stays a single TRUE/FALSE even when no .error node exists,
        # where `error != error_displayed` would have length zero.
        if (!(error %in% error_displayed)) {
          filename <- gsub(
            "attachment;filename=", "",
            page4$response$headers$`content-disposition`
          )
          filename <- str_replace_all(filename, '"', "")
          # Only write when the server actually supplied a file name.
          if (length(filename) == 1 && nzchar(filename)) {
            writeBin(page4$response$content, filename)
          }
        }
      }
    }
  }
}
This is again without RSelenium. :)

R Shiny/Shinydashboard: Hiding the last part of a string in a table

I have a data table that contains some very wide columns and I want to add a scrolling-bar to make it more presentable. So far I have found examples using a scrolling-bar for the entire table - but ideally I would like to have a scrolling-bar for EACH column in the table if that is possible. Below there is an illustrating example. In this code I want a scrolling-bar for both "This_is_a_very_long_name_1", "This_is_a_very_long_name_2" etc.
library("shinydashboard")
library("shiny")
body <- dashboardBody(
fluidPage(
column(width = 4,
box(
title = "Box title", width = NULL, status = "primary",
div(style = 'overflow-x: scroll', tableOutput('table'))
)
)
)
)
ui <- dashboardPage(
dashboardHeader(title = "Column layout"),
dashboardSidebar(),
body
)
server <- function(input, output) {
test.table <- data.frame(lapply(1:8, function(x) {1:10}))
names(test.table) <- paste0('This_is_a_very_long_name_', 1:8)
output$table <- renderTable({
test.table
})
}
# Preview the UI in the console
shinyApp(ui = ui, server = server)
I thought about splitting the table into 8 tables, making a scrolling table for each of them and then putting them next to each other, but space was added between them and it did not look that nice. I think it would be preferable to keep it as one table (but suggestions are very welcome!).
Does anyone know whether this is possible - and how to solve it?
Thanks in advance!
I would not recommend scrolling column headers; I think they would not be very clear to read. Here is code which you can use to wrap the header onto 2 lines so the columns are not too wide:
library("shinydashboard")
library("shiny")
library(DT)
test.table <- data.frame(lapply(1:8, function(x) {1:10}))
names(test.table) <- paste0('This_is_a_very_long_name_', 1:8)
body <- dashboardBody(
fluidPage(
column(width = 8,
box(
title = "Box title", width = NULL, status = "primary",
div(style = 'overflow-x: scroll', dataTableOutput('table'))
)
)
)
)
ui <- dashboardPage(
dashboardHeader(title = "Column layout"),
dashboardSidebar(),
body
)
server <- function(input, output) {
output$table <- renderDataTable({
names(test.table) <- gsub("_"," ",names(test.table))
datatable(test.table, options = list(columnDefs = list(list(width = '100px', targets = c(1:8)))))
})
}
# Preview the UI in the console
shinyApp(ui = ui, server = server)
[UPDATE] --> Column text rendering
Here is one solution which may be useful for you. There is no scrolling; instead, each cell displays only the first three characters of its text (the number of characters displayed can be changed) followed by "...", and hovering the mouse over the cell shows a pop-up with the whole value:
library("shinydashboard")
library("shiny")
library(DT)
x <- c("aaaaaaaaaaaaaa", "bbbbbbbbbbbb", "ccccccccccc")
y <- c("aaaaaaaaaaaaaa", "bbbbbbbbbbbb", "ccccccccccc")
z <- c(1:3)
data <- data.frame(x,y,z)
body <- dashboardBody(
fluidPage(
column(width = 4,
box(
title = "Box title", width = NULL, status = "primary",
div(style = 'overflow-x: scroll', dataTableOutput('table'))
)
)
)
)
ui <- dashboardPage(
dashboardHeader(title = "Column layout"),
dashboardSidebar(),
body
)
server <- function(input, output) {
output$table <- renderDataTable({
datatable(data, options = list(columnDefs = list(list(
targets = c(1:3),
render = JS(
"function(data, type, row, meta) {",
"return type === 'display' && data.length > 3 ?",
"'<span title=\"' + data + '\">' + data.substr(0, 3) + '...</span>' : data;",
"}")),list(width = '100px', targets = c(1:3)))))
})
}
# Preview the UI in the console
shinyApp(ui = ui, server = server)

How to download edited table from selected tab in Shiny?

I have created a Shiny app which takes .csv file as input and generates tabs based on the Grade column.
The data looks like this
Name Age Score Grade
Jane 13 89 A
Hanna 14 67 B
Jack 13 80 A
Leena 14 78 B
Amy 12 65 B
Nina 14 90 A
Steven 12 45 C
Omy 13 59 C
The code will generate tables in each tab containing only rows of the dataset that match the Grade.
These tables are editable. I am trying to download the edited table from the active tabPanel, but I am stuck on what should be the content in the downloadHandler. I have attached my code for reference. Sorry for the messy code, I am fairly new to shiny.
# Shiny app: upload a CSV, show one editable table per Grade in its own tab,
# and download the (edited) table of the currently selected tab.
library(shiny)
library(xlsx)
library(rhandsontable)

ui <- fluidPage(
  titlePanel("Scores"),
  sidebarLayout(
    sidebarPanel(
      fileInput("file", "Upload the file"),
      br(),
      downloadButton('downloadData', 'Save as excel')
    ),
    mainPanel(uiOutput("op"))
  )
)

server <- function(input, output, session) {
  # Uploaded file as a data frame; NULL until a file has been chosen.
  data <- reactive({
    file1 <- input$file
    if (is.null(file1)) {
      return()
    }
    read.csv(file = file1$datapath)
  })

  # Output/input id of the editable table that belongs to one grade tab.
  # A predictable id lets the download handler find the table again.
  table_id <- function(grade) {
    paste0("hot_", grade)
  }

  # Build the tab for grade `x`: register its table under a known id and
  # return the tabPanel that displays it.
  fun1 <- function(x) {
    mydf <- data()
    DF <- mydf[(mydf$Grade == x), c(1:3)]
    output[[table_id(x)]] <- renderRHandsontable({
      rhandsontable(DF, rowHeaders = NULL)
    })
    tabPanel(x, rHandsontableOutput(table_id(x)))
  }

  output$op <- renderUI({
    if (is.null(data()))
      helpText("File not uploaded!")
    else{
      mydf <- data()
      # as.character() so tab titles are plain strings even if Grade was
      # read in as a factor.
      Tabs <- lapply(as.character(unique(mydf$Grade)), fun1)
      do.call(tabsetPanel, c(id = "tabs", Tabs))
    }
  })

  output$downloadData <- downloadHandler(
    filename = function() {
      'Edited table.xls'
    },
    content = function(file) {
      # input$tabs holds the title of the selected tab (the grade); the
      # matching input holds that table's current, possibly edited, data.
      edited <- input[[table_id(input$tabs)]]
      req(edited)
      write.xlsx(hot_to_r(edited), file, row.names = FALSE)
    }
  )
}

shinyApp(ui, server)

R: tm Textmining package: Doc-Level metadata generation is slow

I have a list of documents to process, and for each record I want to attach some metadata to the document "member" inside the "corpus" data structure that tm, the R package, generates (from reading in text files).
This for-loop works but it is very slow,
Performance seems to degrade as a function f ~ 1/n_docs.
for (i in seq(from= 1, to=length(corpus), by=1)){
if(opts$options$verbose == TRUE || i %% 50 == 0){
print(paste(i, " ", substr(corpus[[i]], 1, 140), sep = " "))
}
DublinCore(corpus[[i]], "title") = csv[[i,10]]
DublinCore(corpus[[i]], "Publisher" ) = csv[[i,16]] #institutions
}
This may do something to the corpus variable but I don't know what.
But when I put it inside a tm_map() (similar to lapply() function), it runs much faster, but the changes are not made persistent:
# Attach per-document metadata from `csv` while mapping over the corpus.
# `i` counts documents; it lives outside the function and is advanced with <<-.
i = 0
corpus = tm_map(corpus, function(x){
# Advance the counter so row i of `csv` is used for this document.
i <<- i + 1
if(opts$options$verbose == TRUE){
print(paste(i, " ", substr(x, 1, 140), sep = " "))
}
# Column 10 of `csv` is stored under "Heading", column 16 under "publisher".
meta(x, tag = "Heading") = csv[[i,10]]
meta(x, tag = "publisher" ) = csv[[i,16]]
# NOTE(review): the function ends on the assignment above rather than on `x`,
# so the modified document itself is not what gets returned to tm_map.
})
Variable corpus has empty metadata fields after exiting the tm_map function. It should be filled. I have a few other things to do with the collection.
The R documentation for the meta() function says this:
Examples:
data("crude")
meta(crude[[1]])
DublinCore(crude[[1]])
meta(crude[[1]], tag = "Topics")
meta(crude[[1]], tag = "Comment") <- "A short comment."
meta(crude[[1]], tag = "Topics") <- NULL
DublinCore(crude[[1]], tag = "creator") <- "Ano Nymous"
DublinCore(crude[[1]], tag = "Format") <- "XML"
DublinCore(crude[[1]])
meta(crude[[1]])
meta(crude)
meta(crude, type = "corpus")
meta(crude, "labels") <- 21:40
meta(crude)
I tried many of these calls (with var "corpus" instead of "crude"), but they do not seem to work.
Someone else once seemed to have had the same problem with a similar data set (forum post from 2009, no response)
Here's a bit of benchmarking...
With the for loop :
expr.for <- function() {
for (i in seq(from= 1, to=length(corpus), by=1)){
DublinCore(corpus[[i]], "title") = LETTERS[round(runif(26))]
DublinCore(corpus[[i]], "Publisher" ) = LETTERS[round(runif(26))]
}
}
microbenchmark(expr.for())
# Unit: milliseconds
# expr min lq median uq max
# 1 expr.for() 21.50504 22.40111 23.56246 23.90446 70.12398
With tm_map :
corpus <- crude
expr.map <- function() {
tm_map(corpus, function(x) {
meta(x, "title") = LETTERS[round(runif(26))]
meta(x, "Publisher" ) = LETTERS[round(runif(26))]
x
})
}
microbenchmark(expr.map())
# Unit: milliseconds
# expr min lq median uq max
# 1 expr.map() 5.575842 5.700616 5.796284 5.886589 8.753482
So the tm_map version, as you noticed, seems to be about 4 times faster.
In your question you say that the changes in the tm_map version are not persistent, it is because you don't return x at the end of your anonymous function. In the end it should be :
meta(x, tag = "Heading") = csv[[i,10]]
meta(x, tag = "publisher" ) = csv[[i,16]]
x

Graphite / Carbon / Ceres node overlap

I'm working with Graphite monitoring using Carbon and Ceres as the storage method. I have some problems with correcting bad data. It seems that (due to various problems) I've ended up with overlapping files. That is, since Carbon / Ceres stores the data as timestamp@interval.slice, I can have two or more files with overlapping time ranges.
There are two kinds of overlaps:
File A: +------------+           orig file
File B:    +-----+               subset
File C:          +---------+     overlap
This is causing problems because the existing tools available (ceres-maintenance defrag and rollup) don't cope with these overlaps. Instead, they skip the directory and move on. This is a problem, obviously.
I've created a script that fixes this problem, as follows:
For subsets, just delete the subset file.
For overlaps, using the file system 'truncate' on the orig file at the point where the next file starts. While it is possible to cut off the start of the overlap file and rename it properly, I would suggest that this is fraught with danger.
I've found that it's possible to do this in two ways:
Walk the dirs and iterate over the files, fixing as you go, and find the file subsets, remove them;
Walk the dirs and fix all the problems in a dir before moving on. This is BY FAR the faster approach, since the dir walk is hugely time consuming.
Code:
#!/usr/bin/env python2.6
################################################################################
import io
import os
import time
import sys
import string
import logging
import unittest
import datetime
import random
import zmq
import json
import socket
import traceback
import signal
import select
import simplejson
import cPickle as pickle
import re
import shutil
import collections
from pymongo import Connection
from optparse import OptionParser
from pprint import pprint, pformat
################################################################################
class SliceFile(object):
    """One Ceres slice file, described by its file name and its size.

    The file name is parsed as ``<timeStart>@<freq>.slice``.  The size-derived
    fields (``size``, ``numPoints``, ``timeEnd``) stay ``None`` until
    :meth:`setVars` is called.
    """

    def __init__(self, fname):
        """Parse start time and frequency out of ``fname``.

        Raises ``ValueError``/``IndexError`` when the name does not have the
        ``<int>@<int>.<ext>`` shape.
        """
        self.name = fname
        basename = os.path.basename(fname)
        fnArray = basename.split('@')
        self.timeStart = int(fnArray[0])
        self.freq = int(fnArray[1].split('.')[0])
        self.size = None
        self.numPoints = None
        self.timeEnd = None
        self.deleted = False

    def __repr__(self):
        out = "Name: %s, tstart=%s tEnd=%s, freq=%s, size=%s, npoints=%s." % (
            self.name, self.timeStart, self.timeEnd, self.freq, self.size, self.numPoints)
        return out

    def setVars(self):
        """Read the file size from disk and derive point count and end time."""
        self.size = os.path.getsize(self.name)
        # The size is divided by 8, i.e. one datapoint is taken to be 8 bytes.
        self.numPoints = self.size // 8
        # timeEnd is the first timestamp *after* the last stored point.
        self.timeEnd = self.timeStart + (self.numPoints * self.freq)
################################################################################
class CeresOverlapFixup(object):
    """Walk a directory tree and remove overlaps between Ceres slice files.

    Within each directory that contains a ``.ceres-node`` entry:

    * a slice that ends no later than an earlier-starting slice is deleted;
    * a slice that runs into the start of the next slice is truncated so that
      it ends where the next one begins.

    Everything that is logged goes to stdout and to ``ceresOverlapFixup.log``.
    """

    def __del__(self):
        # The log file may be missing (constructor failed or was bypassed) or
        # already closed; in both cases there is nothing left to do.
        logFile = getattr(self, "LOGFILE", None)
        if logFile is None or logFile.closed:
            return
        import datetime
        self.writeLog("Ending at %s" % (str(datetime.datetime.today())))
        logFile.flush()
        logFile.close()

    def __init__(self):
        self.verbose = False
        self.debug = False
        self.LOGFILE = open("ceresOverlapFixup.log", "a")
        self.badFilesList = set()
        self.truncated = 0       # files successfully truncated
        self.subsets = 0         # subset files removed
        self.dirsExamined = 0    # directories visited by walkDirStructure
        self.lastStatusTime = 0

    def getOptionParser(self):
        return OptionParser()

    def getOptions(self):
        """Parse the command line into ``debug``, ``verbose`` and ``basedir``.

        Raises ``AssertionError`` when no base directory was given.
        """
        parser = self.getOptionParser()
        parser.add_option("-d", "--debug", action="store_true", dest="debug", default=False,
                          help="debug mode for this program, writes debug messages to logfile." )
        parser.add_option("-v", "--verbose", action="store_true", dest="verbose", default=False,
                          help="verbose mode for this program, prints a lot to stdout." )
        parser.add_option("-b", "--basedir", action="store", type="string", dest="basedir", default=None,
                          help="base directory location to start converting." )
        (options, args) = parser.parse_args()
        self.debug = options.debug
        self.verbose = options.verbose
        self.basedir = options.basedir
        assert self.basedir, "must provide base directory."

    # Examples:
    # ./updateOperations/1346805360@60.slice
    # ./updateOperations/1349556660@60.slice
    # ./updateOperations/1346798040@60.slice
    def getFileData(self, inFilename):
        """Return a fully populated SliceFile for ``inFilename``."""
        ret = SliceFile(inFilename)
        ret.setVars()
        return ret

    def removeFile(self, inFilename):
        """Delete a subset file and count it (the only place that counts)."""
        os.remove(inFilename)
        #self.writeLog("removing file: %s" % (inFilename))
        self.subsets += 1

    def truncateFile(self, fname, newSize):
        """Shrink ``fname`` to ``newSize`` bytes; failures are logged, not raised.

        ``self.truncated`` is incremented only when the truncate succeeded.
        """
        if self.verbose:
            self.writeLog("Truncating file, name=%s, newsize=%s" % (pformat(fname), pformat(newSize)))
        IFD = None
        try:
            # No O_CREAT: a slice that has vanished must not be re-created as
            # an empty file and then extended to newSize.
            IFD = os.open(fname, os.O_RDWR)
            os.ftruncate(IFD, newSize)
            self.truncated += 1
        except Exception:
            self.writeLog("Exception during truncate: %s" % (traceback.format_exc()))
        finally:
            if IFD is not None:
                os.close(IFD)

    def printStatus(self, force=False):
        """Log progress counters, at most once every 10 seconds unless forced."""
        now = self.getNowTime()
        if force or ((now - self.lastStatusTime) > 10):
            self.writeLog("Status: time=%d, Walked %s dirs, subsetFilesRemoved=%s, truncated %s files." % (now, self.dirsExamined, self.subsets, self.truncated))
            self.lastStatusTime = now

    def fixupThisDir(self, inPath, inFiles):
        """Fix all subset and overlap situations among the slices in one dir.

        ``inPath`` is the directory, ``inFiles`` the file names in it (as
        produced by ``os.walk``).  Directories without a ``.ceres-node`` entry
        are ignored.  Files whose names cannot be parsed are recorded in
        ``self.badFilesList`` and skipped.
        """
        if '.ceres-node' not in inFiles:
            return
        sliceNames = [x for x in inFiles if (x != '.ceres-node') and ('@' in x)]
        fileObjList = []
        for thisFile in sorted(sliceNames):
            wholeFilename = os.path.join(inPath, thisFile)
            try:
                fileObjList.append(self.getFileData(wholeFilename))
            except Exception:
                self.badFilesList.add(wholeFilename)
                self.writeLog("ERROR: file %s, %s" % (wholeFilename, traceback.format_exc()))
        # Order by numeric start time; a plain name sort only agrees with it
        # while all timestamps have the same number of digits.
        fileObjList.sort(key=lambda thisObj: (thisObj.timeStart, thisObj.name))
        while fileObjList:
            self.printStatus()
            firstFile = fileObjList[0]
            # Every later-starting slice that ends no later than firstFile is
            # a subset of it: delete it.
            removedFiles = set()
            for curFile in fileObjList[1:]:
                if curFile.timeEnd <= firstFile.timeEnd:
                    self.removeFile(curFile.name)
                    removedFiles.add(curFile.name)
                    if self.verbose:
                        self.writeLog("Subset file situation. First=%s, overlap=%s" % (firstFile, curFile))
            fileObjList = [x for x in fileObjList if x.name not in removedFiles]
            if len(fileObjList) < 2:
                break
            secondFile = fileObjList[1]
            # Strictly-less-than is right: timeEnd is the first free timestamp
            # after firstFile.  E.g. first starts@100 with 2 points uses
            # 100 and 101 and ends at 102, so a second file starting@102 is OK.
            if secondFile.timeStart < firstFile.timeEnd:
                # file_A (first):  +---------+
                # file_B (second):       +----------+
                # Solve by truncating file_A at the start point of file_B.
                newLenFile_A_seconds = secondFile.timeStart - firstFile.timeStart
                newFile_A_datapoints = newLenFile_A_seconds // firstFile.freq
                newFile_A_bytes = newFile_A_datapoints * 8
                # A zero-byte result would empty the file; leave it alone.
                if newFile_A_bytes:
                    self.truncateFile(firstFile.name, newFile_A_bytes)
                    if self.verbose:
                        self.writeLog("Truncate situation. First=%s, overlap=%s" % (firstFile, secondFile))
            # firstFile is settled; continue with the remaining slices.
            fileObjList = fileObjList[1:]

    def getNowTime(self):
        return time.time()

    def walkDirStructure(self):
        """Run fixupThisDir on every directory below ``self.basedir``."""
        startTime = self.getNowTime()
        self.lastStatusTime = startTime
        self.okayFiles = 0
        for (thisPath, theseDirs, theseFiles) in os.walk(self.basedir):
            self.printStatus()
            self.fixupThisDir(thisPath, theseFiles)
            self.dirsExamined += 1
        endTime = self.getNowTime()
        # Always report the final counters, however short the run was.
        self.printStatus(force=True)
        self.writeLog( "now = %s, started at %s, elapsed time = %s seconds." % (endTime, startTime, endTime - startTime))
        self.writeLog( "Done.")

    def writeLog(self, instring):
        """Write one line to stdout and to the log file, flushing the latter."""
        line = "%s\n" % (instring,)
        sys.stdout.write(line)
        self.LOGFILE.write(line)
        self.LOGFILE.flush()

    def main(self):
        self.getOptions()
        self.walkDirStructure()

Resources