Hbase scan with offset - hadoop

Is there a way to scan a HBase table getting, for example, the first 100
results, then later get the next 100 and so on... Just like in SQL we do
with LIMIT and OFFSET?
My row keys are uuid

You can do it multiple ways. The easiest one is a page filter. Below is the code example from HBase: The Definitive Guide, page 150.
// Page through an HBase table with PageFilter(15): each outer iteration
// fetches one "page" of up to 15 rows per region server, then restarts the
// scan just past the last row key seen.
// POSTFIX (a single 0x00 byte) is appended to the last row key so the next
// scan starts at the first key strictly AFTER it (0x00 is the smallest
// possible suffix, so no valid key is skipped).
private static final byte[] POSTFIX = new byte[] { 0x00 };
Filter filter = new PageFilter(15);
int totalRows = 0; byte[] lastRow = null;
while (true) {
Scan scan = new Scan();
scan.setFilter(filter);
if (lastRow != null) {
// resume immediately after the last row of the previous page
byte[] startRow = Bytes.add(lastRow, POSTFIX);
System.out.println("start row: " + Bytes.toStringBinary(startRow));
scan.setStartRow(startRow);
}
ResultScanner scanner = table.getScanner(scan);
int localRows = 0;

 Result result;

 while ((result = scanner.next()) != null) {
System.out.println(localRows++ + ": " + result);
totalRows++;
lastRow = result.getRow();  // remember where this page ended
}
scanner.close();
// an empty page means the table is exhausted
if (localRows == 0) break;
}

System.out.println("total rows: " + totalRows);
Or you can set caching on the scan for the limit you want and then change the start row to the last row + 1 from the previous scan for every subsequent scan.

Related

Fastest way to search for a row in a large Google Sheet using/in Google Apps Script

GAS is quite powerful and you could write a full fledged web-app using a Google Sheet as the DB back-end. There are many reasons not to do this but I figure in some cases it is okay.
I think the biggest issue will be performance issues when looking for rows based on some criteria in a sheet with a lot of rows. I know there are many ways to "query" a sheet but I can't find reliable information on which is the fastest.
One of the complexities is that many people can edit a sheet which means there are a variable number of situations you'd have to account for. For the sake of simplicity, I want to assume the sheet:
Is locked down so only one person can see it
The first column has the row number (=row())
The most basic query is finding a row where a specific column equals some value.
Which method would be the fastest?
I have a sheet with ~19k rows and ~38 columns, filled with all sorts of unsorted real-world data. That is almost 700k cells, so I figured it would be a good sheet to time a few methods and see which is the fastest.
method 1: get sheet as a 2D array then go through each row
method 2: get sheet as a 2D array, sort it, then using a binary search algorithm to find the row
method 3: make a UrlFetch call to Google visualization query and don't provide last row
method 4: make a UrlFetch call to Google visualization query and provide last row
Here are my query functions.
function method1(spreadsheetID, sheetName, columnIndex, query)
{
  // Linear scan: pull every data row (header excluded) into memory, then
  // walk the rows until one matches the query value in the given column.
  // Returns the matching row array, or false when nothing matches.
  var sheet = SpreadsheetApp.openById(spreadsheetID).getSheetByName(sheetName);
  var data = sheet.getSheetValues(2, 1, -1, -1);
  var rowCount = data.length;
  for (var r = 0; r < rowCount; ++r)
  {
    var row = data[r];
    // loose equality on purpose: sheet values may be numbers or strings
    if (row[columnIndex] == query)
    {
      return row;
    }
  }
  // nothing matched
  return false;
}
function method2(spreadsheetID, sheetName, columnIndex, query)
{
  // Sort-then-binary-search variant of method1. Returns the matching row
  // array, or false when the value is not present.
  var data = SpreadsheetApp.openById(spreadsheetID)
      .getSheetByName(sheetName)
      .getSheetValues(2, 1, -1, -1); // all data rows, header excluded
  // order the rows by the search column so binary search is applicable
  data.sort(function(left, right) {
    var a = left[columnIndex];
    var b = right[columnIndex];
    if (a < b) return -1;
    if (a > b) return 1;
    return 0;
  });
  // binary search over the sorted matrix
  var hit = matrixBinarySearch(data, columnIndex, query, 0, data.length - 1);
  return hit == -1 ? false : data[hit];
}
function method3(spreadsheetID, sheetName, queryColumnLetterStart, queryColumnLetterEnd, queryColumnLetterSearch, query)
{
  // Query the sheet through the Google Visualization (gviz) endpoint.
  // The range is open-ended (e.g. "A:AL") — no last row is supplied.
  // Returns the parsed gviz JSON response object.
  // BUG FIX: myQuery was assigned without `var`, creating an implicit global.
  var myQuery = "SELECT * WHERE " + queryColumnLetterSearch + " = '" + query + "'";
  // NOTE(review): `query` is spliced into the gviz query unescaped; a value
  // containing a single quote will break the query string.
  var qvizURL = 'https://docs.google.com/spreadsheets/d/' + spreadsheetID + '/gviz/tq?tqx=out:json&headers=1&sheet=' + sheetName + '&range=' + queryColumnLetterStart + ":" + queryColumnLetterEnd + '&tq=' + encodeURIComponent(myQuery);
  // fetch with the script's OAuth token so private sheets are readable
  var ret = UrlFetchApp.fetch(qvizURL, {headers: {Authorization: 'Bearer ' + ScriptApp.getOAuthToken()}}).getContentText();
  // strip the JSONP wrapper ("/*O_o*/google.visualization.Query.setResponse(...);")
  return JSON.parse(ret.replace("/*O_o*/", "").replace("google.visualization.Query.setResponse(", "").slice(0, -2));
}
function method4(spreadsheetID, sheetName, queryColumnLetterStart, queryColumnLetterEnd, queryColumnLetterSearch, query)
{
  // Same gviz query as method3, but the range is bounded with the sheet's
  // actual last row (e.g. "A1:AL19000") instead of an open-ended range.
  // Returns the parsed gviz JSON response object.
  var lastRow = SpreadsheetApp.openById(spreadsheetID).getSheetByName(sheetName).getLastRow();
  // BUG FIX: myQuery was assigned without `var`, creating an implicit global.
  var myQuery = "SELECT * WHERE " + queryColumnLetterSearch + " = '" + query + "'";
  var qvizURL = 'https://docs.google.com/spreadsheets/d/' + spreadsheetID + '/gviz/tq?tqx=out:json&headers=1&sheet=' + sheetName + '&range=' + queryColumnLetterStart + "1:" + queryColumnLetterEnd + lastRow + '&tq=' + encodeURIComponent(myQuery);
  // fetch with the script's OAuth token so private sheets are readable
  var ret = UrlFetchApp.fetch(qvizURL, {headers: {Authorization: 'Bearer ' + ScriptApp.getOAuthToken()}}).getContentText();
  // strip the JSONP wrapper before parsing
  return JSON.parse(ret.replace("/*O_o*/", "").replace("google.visualization.Query.setResponse(", "").slice(0, -2));
}
My binary search algorithm:
function matrixBinarySearch(matrix, columnIndex, query, firstIndex, lastIndex)
{
  // Binary search over a matrix (array of row arrays) sorted ascending on
  // matrix[*][columnIndex]. Returns the index of a matching row, or -1.
  // https://www.w3resource.com/javascript-exercises/javascript-array-exercise-18.php
  // BUG FIX: guard against an empty/inverted range (an empty sheet gives
  // lastIndex = -1), which previously dereferenced matrix[-1] and threw.
  if (firstIndex > lastIndex) return -1;
  // query outside [min, max] of the range cannot be present
  if (query < matrix[firstIndex][columnIndex] || query > matrix[lastIndex][columnIndex]) return -1;
  // direct hits on the range boundaries
  if (query == matrix[firstIndex][columnIndex]) return firstIndex;
  if (query == matrix[lastIndex][columnIndex]) return lastIndex;
  // standard halving loop; exits when found or when the range collapses
  var middleIndex = Math.floor((lastIndex + firstIndex) / 2);
  while (matrix[middleIndex][columnIndex] != query && firstIndex < lastIndex)
  {
    if (query < matrix[middleIndex][columnIndex])
    {
      lastIndex = middleIndex - 1;
    }
    else if (query > matrix[middleIndex][columnIndex])
    {
      firstIndex = middleIndex + 1;
    }
    middleIndex = Math.floor((lastIndex + firstIndex) / 2);
  }
  // loop may end on a non-match when the value is absent
  return matrix[middleIndex][columnIndex] == query ? middleIndex : -1;
}
This is the function I used to test them all:
// each time this function is called it will try one method
// the first time it is called it will try method1
// then method2, then method3, then method4
// after it does method4 it will start back at method1
// we will use script properties to save which method is next
// we also want to use the same query string for each batch so we'll save that in script properties too
function testIt()
{
// Runs ONE of the four methods per invocation, cycling
// method1 -> method2 -> method3 -> method4 and then starting over.
// Script properties remember which method runs next and which query string
// the current batch uses, so all four methods of a batch search for the
// same value. Relies on the global `searchList` array of query strings.
// get the sheet where we're saving run times
var runTimesSheet = SpreadsheetApp.openById("...").getSheetByName("times");
// we want true speed tests without server-side caching, so each run works
// on a fresh copy of the data sheet
var tempSheetID = SpreadsheetApp.openById("...").copy("temp sheet").getId();
// get script properties
var scriptProperties = PropertiesService.getScriptProperties();
// which method to run this time (0..3)
var searchCounter = Number(scriptProperties.getProperty("searchCounter"));
// index of the search-list entry we want to query for
var searchListIndex = Number(scriptProperties.getProperty("searchListIndex"));
// at the start of a batch, pick a new random query string
if(searchCounter == 0)
{
searchListIndex = Math.floor(Math.random() * searchList.length);
scriptProperties.setProperty("searchListIndex", searchListIndex);
}
// query string
var query = searchList[searchListIndex];
// [method label, query index, query, seconds, first four matched values]
var timerRow = ["method" + (searchCounter + 1), searchListIndex, query, 0, "", "", "", ""];
// run the appropriate method
switch(searchCounter)
{
case 0:
// start time
var start = (new Date()).getTime();
// run the query
var ret = method1(tempSheetID, "Extract", 1, query);
// elapsed seconds
timerRow[3] = ((new Date()).getTime() - start) / 1000;
// if we found the row, save its first values so the hit can be confirmed
if(ret)
{
timerRow[4] = ret[0];
timerRow[5] = ret[1];
timerRow[6] = ret[2];
timerRow[7] = ret[3];
}
break;
case 1:
var start = (new Date()).getTime();
var ret = method2(tempSheetID, "Extract", 1, query);
timerRow[3] = ((new Date()).getTime() - start) / 1000;
if(ret)
{
timerRow[4] = ret[0];
timerRow[5] = ret[1];
timerRow[6] = ret[2];
timerRow[7] = ret[3];
}
break;
case 2:
var start = (new Date()).getTime();
var ret = method3(tempSheetID, "Extract", "A", "AL", "B", query);
timerRow[3] = ((new Date()).getTime() - start) / 1000;
if(ret.table.rows.length)
{
timerRow[4] = ret.table.rows[0].c[0].v;
timerRow[5] = ret.table.rows[0].c[1].v;
timerRow[6] = ret.table.rows[0].c[2].v;
timerRow[7] = ret.table.rows[0].c[3].v;
}
break;
case 3:
var start = (new Date()).getTime();
// BUG FIX: this case previously called method3 a second time, so method4
// (the bounded-range variant) was never actually measured.
var ret = method4(tempSheetID, "Extract", "A", "AL", "B", query);
timerRow[3] = ((new Date()).getTime() - start) / 1000;
if(ret.table.rows.length)
{
timerRow[4] = ret.table.rows[0].c[0].v;
timerRow[5] = ret.table.rows[0].c[1].v;
timerRow[6] = ret.table.rows[0].c[2].v;
timerRow[7] = ret.table.rows[0].c[3].v;
}
break;
}
// delete the temp file
DriveApp.getFileById(tempSheetID).setTrashed(true);
// save run times
runTimesSheet.appendRow(timerRow);
// start back at 0 if we're at the end
if(++searchCounter == 4) searchCounter = 0;
// save the search counter
scriptProperties.setProperty("searchCounter", searchCounter);
}
I have a global variable searchList that is an array of various query strings -- some are in the sheet, some are not.
I ran testIt on a trigger to run every minute. After 152 iterations I had 38 batches. Looking at the result, this is what I see for each method:
| Method | Minimum Seconds | Maximum Seconds | Average Seconds |
|---------|-----------------|-----------------|-----------------|
| method1 | 8.24 | 36.94 | 11.86 |
| method2 | 9.93 | 23.38 | 14.09 |
| method3 | 1.92 | 5.48 | 3.06 |
| method4 | 2.20 | 11.14 | 3.36 |
So it appears that, at least for my data set, using a Google visualization query is the fastest.

JDBC ResultSet to ArrayList, ArrayList to .txt file

I got the following piece of code that retrieves all rows in a table:
// Question code: dumps every row of _RefTeleLink into one flat list and
// tab-joins the whole list at the end.
// NOTE(review): the column count 22 is hard-coded, and the "\n" added after
// each row is itself a list element, so String.join puts a tab on both
// sides of every newline — which is why the output does not line up as rows.
String MakeTXT = "USE SRO_VT_SHARD Select * from _RefTeleLink";
pst = conn.prepareStatement(MakeTXT);
rs = pst.executeQuery();
ArrayList<String> links = new ArrayList<>();
int i = 1;
String rows = "";
while (rs.next()) {
// JDBC columns are 1-based, hence i = 1..22
for (i = 1; i <= 22; i++) {
links.add(rs.getString(i));
if (i == 22) {
links.add("\n");
}
}
}
rows = String.join("\t", links);
System.out.println(rows);
// stray braces below close the enclosing method/class (not shown in this excerpt)
}
}
What I want to do is:
Select all rows from the table. See result: prnt.sc/egbh4o
Write all selected rows to a .txt file
.txt file has to look something like this (literally copy pasted the rows): http://prntscr.com/egbhn4
What my code currently outputs:
output
It does this because there are 22 columns, and when the loop reaches 22, it adds an enter to the ArrayList.
What I'm actually looking for is a way to copy an entire row using ResultSet, instead of using a for loop to loop 22 times, and make a row of the 22 results.
Have looked everywhere but couldn't find anything.. :(
You do not need an ArrayList to hold the column values as they are read. It is better to use a StringBuilder as shown below, concatenating tabs inside the loop and then replacing the last one with a line feed.
String MakeTXT = "USE SRO_VT_SHARD Select * from _RefTeleLink";
Statement stm = conn.createStatement();
ResultSet rs = stm.executeQuery(MakeTXT);
List<String> rows = new ArrayList<>();
StringBuilder row = new StringBuilder();
ResultSetMetaData meta = rs.getMetaData();
final int colCount = meta.getColumnCount();
while (rs.next()) {
row.setLength(0);
for (int c=0; c<=colCount; c++)
row.append(rs.getString(c)).append("\t");
row.setCharAt(row.length()-1, '\n');
rows.add(row.toString());
}
rs.close();
stm.close();

Finding highest number in text file

I have a text file that contains 50 student names and scores for each student in the format.
foreName.Surname:Mark
I have figured out how to split up each line into a forename, surname and mark using this code.
// Read "foreName.Surname:Mark" lines and project each into an anonymous
// record, skipping the header line (i == 0).
// BUG FIX: the verbatim-string prefix in C# is '@', not '#'.
string[] Lines = File.ReadAllLines(@"StudentExamMarks.txt");
int i = 0;
var items = from line in Lines
            where i++ != 0                      // skip header line
            let words = line.Split(' ', '.', ':')
            select new
            {
                foreName = words[0],
                Surname = words[1],
                Mark = words[2]                 // NOTE(review): still a string; parse to int for comparisons
            };
I am unsure of how I would incorporate a findMax algorithm to find the highest mark and display the pupil who earned it, as I have not used text files that often.
You can use any sorting algorithm; there is pseudo-code available for finding the maximum number in any list or array.
Try this code; it just requires parsing all the lines.
// Scan every well-formed "foreName.Surname:Mark" line and keep the record
// with the highest mark.
// BUG FIX: the verbatim-string prefix in C# is '@', not '#'.
string[] lines = File.ReadAllLines(@"StudentExamMarks.txt");
string maxForeName = null;
string maxSurName = null;
var maxMark = 0;
bool found = false; // whether any valid record has been seen yet
for (int i = 0; i < lines.Length; i++)
{
    var tmp = lines[i].Split(new char[] { ' ', '.', ':' }, StringSplitOptions.RemoveEmptyEntries);
    if (tmp.Length == 3)
    {
        int value = int.Parse(tmp[2]);
        // BUG FIX: the original seeded the maximum with "i == 0 || ...",
        // which fails when line 0 is a header or malformed; seed on the
        // first successfully parsed record instead.
        if (!found || value > maxMark)
        {
            found = true;
            maxMark = value;
            maxForeName = tmp[0];
            maxSurName = tmp[1];
        }
    }
}

Using appium how can we read/print complete list in a drop down in native as well as Hybrid application?

I want to print or read complete contact list using appium (suppose my .apk is a contact application and it shows complete A to Z contacts and by scrolling i can view all of the contacts).
I am able to count/print contacts shown in the first screen (i.e the screen which shows some contact say 10 contacts by default and for more i have to scroll).i have come to solution of the above and its working for some time but after that it throws an error, Please help how to rectify this issue
java.lang.IndexOutOfBoundsException: Index: 9, Size: 9
My code is `
// Question code: tries to print 200 contacts, but 'allContactsOnfirstScreen'
// is populated ONCE from the first visible screen (~9 elements here) and is
// never re-queried after a swipe — so .get(i) throws
// java.lang.IndexOutOfBoundsException (Index: 9, Size: 9) once i reaches
// the list size.
AppiumDriver driver = null;
ArrayList values = new ArrayList();
DesiredCapabilities cap = new DesiredCapabilities();
cap.setCapability(MobileCapabilityType.DEVICE_NAME, "Galaxy S4");
cap.setCapability(MobileCapabilityType.PLATFORM_NAME, "Android");
cap.setCapability(MobileCapabilityType.PLATFORM_VERSION, "4.4.4");
cap.setCapability(MobileCapabilityType.APP_PACKAGE, "com.brainworks.contacts");
cap.setCapability(MobileCapabilityType.APP_ACTIVITY, "com.brainworks.contacts.ui.Main");
driver = new AndroidDriver(new URL("http://127.0.0.1:4723/wd/hub"),cap);
driver.manage().timeouts().implicitlyWait(10, TimeUnit.SECONDS);
// only the rows currently rendered on screen are returned here
List<WebElement> allContactsOnfirstScreen = driver.findElements(By.xpath("//*[#resource-id='com.brainworks.contacts:id/txt_name']"));
driver.context("NATIVE_APP");
// swipe gesture: from 70% of the screen height up to 55%, horizontally centered
Dimension size = driver.manage().window().getSize();
int StartY = (int)(size.height * 0.70);
int EndY = (int)(size.height * 0.55);
int StartX = size.width/2;
// NOTE(review): i is bounded by 200, not by the list size — the .get(i)
// below is the line that throws once i >= allContactsOnfirstScreen.size()
for(int i =0;i<200;i++){
System.out.println("Contacts are = " + allContactsOnfirstScreen.get(i).getAttribute("text"));
String Values = allContactsOnfirstScreen.get(i).getAttribute("text");
values.add(Values);
System.out.println("Value is = " + values);
String ext = Values;
String [] a = ext.split(",");
String [] b = a[a.length-1].split("//]");
System.out.println("val = " + b[0]);
driver.swipe(StartX, StartY, StartX, EndY , 1000);
}`
Second way which i have tried
// Second attempt: print the visible list, swipe from the last visible
// element up to the first, and repeat until the last visible name matches
// a known sentinel contact ("MyLastContactText").
AppiumDriver driver = null;
// desired cap as above in the first try
// same as above -but now what i am doing is i am printing the list in for loop
// then do a swipe form last element to first element then again print the new list
// and this cycle keeps on running until i reach the end of the list or the parameter
// in the if loop.
int XOfFirstElement = allContactsOnfirstScreen.get(0).getLocation().getX();
int YOfFirstElement = allContactsOnfirstScreen.get(0).getLocation().getY();
System.out.println("X for XOfFirstElement = " + XOfFirstElement);
System.out.println("Y for YOfFirstElement = " + YOfFirstElement);
int XOfLastElement = allContactsOnfirstScreen.get(allContactsOnfirstScreen.size()-1).getLocation().getX();
int YOfLastElement = allContactsOnfirstScreen.get(allContactsOnfirstScreen.size()-1).getLocation().getY();
System.out.println("X for XOfLastElement = " + XOfLastElement);
System.out.println("Y for YOfLastElement = " + YOfLastElement);
int XOfSecondLastElement = allContactsOnfirstScreen.get(allContactsOnfirstScreen.size()-2).getLocation().getX();
int YOfSecondLastElement = allContactsOnfirstScreen.get(allContactsOnfirstScreen.size()-2).getLocation().getY();
System.out.println("X for XOfSecondLastElement = " + XOfSecondLastElement);
System.out.println("Y for YOfSecondLastElement = " + YOfSecondLastElement);
while(true){
// NOTE(review): allContactsOnfirstScreen is never re-fetched after a swipe,
// so stale/duplicate values get printed when element positions repeat.
for(int i = 0 ;i<allContactsOnfirstScreen.size();i++){
System.out.println("Value is for " + allContactsOnfirstScreen.get(i).getAttribute("text"));
}
driver.swipe(XOfLastElement, YOfLastElement, XOfFirstElement, YOfFirstElement, 1000);
Thread.sleep(1000L);
String LastName = allContactsOnfirstScreen.get(allContactsOnfirstScreen.size()-1).getAttribute("text");
// NOTE(review): the loop exits as soon as the sentinel becomes the last
// visible name — BEFORE that screen's names are printed — which is why the
// final contact is never output.
if(LastName.equals("MyLastContactText"))
break;
}
Now the problem is
1.>Every thing is working as expected but what is happening is sometime it prints the duplicate value (Contact) (may be due to coordinates for that contact do not changes after a swipe)
2. i am not able to print the last contact name every time cause my if statement in the while loop takes me out and script stops.
if anyone can help then it will be great if any new way plz also let me know Thanks in advance
I suppose, following the code snippet, that the problem is in the String [] b = a[a.length-1].split("//]"); line. You probably receive 0 from a.length and this leads to the exception. Could you please print the variable a before this line?
Also if this is not the problem, could you please add full exception stack to the post?

Most Frequent 3 page sequence in a weblog

Given a web log which consists of fields 'User ' 'Page url'. We have to find out the most frequent 3-page sequence that users takes.
There is a timestamp, and it is not guaranteed that a single user's accesses are logged sequentially; the log could read: user1 Page1, user2 PageX, user1 Page2, user10 PageX, user1 Page3. Here user1's page sequence is Page1 -> Page2 -> Page3.
Assuming your log is stored in timestamp order, here's an algorithm to do what you need:
Create a hashtable 'user_visits' mapping user ID to the last two pages you observed them to visit
Create a hashtable 'visit_count' mapping 3-tuples of pages to frequency counts
For each entry (user, URL) in the log:
If 'user' exists in user_visits with two entries, increment the entry in visit_count corresponding to the 3-tuple of URLs by one
Append 'URL' to the relevant entry in user_visits, removing the oldest entry if necessary.
Sort the visit_count hashtable by value. This is your list of most popular sequences of URLs.
Here's an implementation in Python, assuming your fields are space-separated:
# Tally every 3-page sequence per user from a space-separated "user url" log.
fh = open('log.txt', 'r')
user_visits = {}    # user -> tuple of up to 2 most recently visited URLs
visit_counts = {}   # (url1, url2, url3) -> frequency
for row in fh:
    # BUG FIX: split() without arguments also strips the trailing newline
    # that row.split(' ') left attached to the URL.
    user, url = row.split()
    # BUG FIX: the original misspelled this as 'prev_vists' (NameError).
    prev_visits = user_visits.get(user, ())
    if len(prev_visits) == 2:
        visit_tuple = prev_visits + (url,)
        visit_counts[visit_tuple] = visit_counts.get(visit_tuple, 0) + 1
    # BUG FIX: the original indexed prev_visits[1] unconditionally, which
    # raises IndexError on a user's first or second visit; keep the last two.
    user_visits[user] = (prev_visits + (url,))[-2:]
fh.close()
# BUG FIX: sort by frequency; the original's key=lambda x: x[1] sorted the
# 3-tuples by their second URL, not by their counts.
popular_sequences = sorted(visit_counts, key=visit_counts.get, reverse=True)
Quick and dirty:
Build a list of url/timestamps per
user
sort each list by timestamp
iterate over each list
for each 3 URL sequence, create or increment a counter
find the highest count in the URL sequence count list
// Pass 1: bucket each log entry's (time, url) under its user
foreach(entry in parsedLog)
{
    users[entry.user].urls.add(entry.time, entry.url)
}
// Pass 2: per user, sort visits by time and count every 3-URL window
foreach(user in users)
{
    user.urls.sort()
    for(i = 0; i < user.urls.length - 2; i++)
    {
        // FIX: the closing parenthesis was missing in the original
        key = createKey(user.urls[i], user.urls[i+1], user.urls[i+2])
        sequenceCounts.incrementOrCreate(key);
    }
}
// Highest count wins
sequenceCounts.sortDesc()
largestCountKey = sequenceCounts[0]
// FIX: the variable was misspelled 'largestCountkey'
topUrlSequence = parseKey(largestCountKey)
Here's a bit of SQL assuming you could get your log into a table such as
-- One row per log line. 'ord' preserves the original file order so that a
-- user's consecutive visits can be reconstructed even without timestamps.
CREATE TABLE log (
ord int,                      -- line number within the source log file
user VARCHAR(50) NOT NULL,
url VARCHAR(255) NOT NULL,
ts datetime                   -- timestamp of the visit
) ENGINE=InnoDB;
If the data is not sorted per user then (assuming that ord column is the number of the line from the log file)
-- For every log row t, join the same user's NEXT row (t2 = minimal ord
-- greater than t.ord) and the row after that (t3), then count how often
-- each 3-URL sequence occurs and keep the ten most frequent.
SELECT t.url, t2.url, t3.url, count(*) c
FROM
log t INNER JOIN
log t2 ON t.user = t2.user INNER JOIN
log t3 ON t2.user = t3.user
WHERE
t2.ord IN (SELECT MIN(ord)          -- t2 is the visit immediately after t
FROM log i
WHERE i.user = t.user AND i.ord > t.ord)
AND
t3.ord IN (SELECT MIN(ord)          -- t3 is the visit immediately after t2
FROM log i
WHERE i.user = t.user AND i.ord > t2.ord)
GROUP BY t.user, t.url, t2.url, t3.url
ORDER BY c DESC
LIMIT 10;
This will give top ten 3 stop paths for a user. Alternatively if you can get it ordered by user and time you can join on rownumbers more easily.
Source code in Mathematica
s= { {user},{page} } (* load List (log) here *)
(* stable sort of the rows by user (column 1): pairing each user with its
   original position keeps a user's visits in log order after sorting *)
sortedListbyUser=s[[Ordering[Transpose[{s[[All, 1]], Range[Length[s]]}]] ]]
(* count overlapping windows of 3 consecutive entries (offset 1) *)
Tally[Partition [sortedListbyUser,3,1]]
This problem is similar to
Find k most frequent words from a file
Here is how you can solve it :
Group each triplet (page1, page2, page3) into a word
Apply the algorithm mentioned here
1.Reads user page access urls from file line by line,these urls separated by separator,eg:
u1,/
u1,main
u1,detail
The separator is comma.
2.Store each page's visit count to map:pageVisitCounts;
3.Sort the visit count map by value in descend order;
/**
 * Counts every {@code depth}-page sequence per user in a log file whose
 * lines are "user&lt;separator&gt;page".
 *
 * @param file      path to the log file; null/empty returns an empty map
 * @param separator field separator (also used to join sequence keys)
 * @param depth     sequence length; NOTE(review): the key-building code
 *                  below reads exactly two buffered pages, so only depth
 *                  == 3 actually works — confirm before generalizing
 * @return map of "p1&lt;sep&gt;p2&lt;sep&gt;p3" -&gt; visit count
 */
public static Map<String, Integer> findThreeMaxPagesPathV1(String file, String separator, int depth) {
    Map<String, Integer> pageVisitCounts = new HashMap<String, Integer>();
    if (file == null || "".equals(file)) {
        return pageVisitCounts;
    }
    // FIX: try-with-resources — the original closed the readers only on the
    // success path and leaked them whenever readLine() threw.
    try (BufferedReader bf = new BufferedReader(new FileReader(new File(file)))) {
        // per-user buffer of the most recently seen pages
        Map<String, List<String>> userUrls = new HashMap<String, List<String>>();
        String currentLine;
        while ((currentLine = bf.readLine()) != null) {
            String[] lineArr = currentLine.split(separator);
            if (lineArr == null || lineArr.length != (depth - 1)) {
                continue; // skip malformed lines
            }
            String user = lineArr[0];
            String page = lineArr[1];
            List<String> urlLinkedList;
            if (userUrls.get(user) == null) {
                // first visit for this user: start a fresh buffer
                urlLinkedList = new LinkedList<String>();
            } else {
                urlLinkedList = userUrls.get(user);
                String pages = "";
                if (urlLinkedList.size() == (depth - 1)) {
                    pages = urlLinkedList.get(0).trim() + separator + urlLinkedList.get(1).trim() + separator + page;
                } else if (urlLinkedList.size() > (depth - 1)) {
                    // drop the oldest page so the buffer keeps depth-1 entries
                    urlLinkedList.remove(0);
                    pages = urlLinkedList.get(0).trim() + separator + urlLinkedList.get(1).trim() + separator + page;
                }
                if (!"".equals(pages)) {
                    Integer count = (pageVisitCounts.get(pages) == null ? 0 : pageVisitCounts.get(pages)) + 1;
                    pageVisitCounts.put(pages, count);
                }
            }
            urlLinkedList.add(page);
            userUrls.put(user, urlLinkedList);
        }
    } catch (FileNotFoundException e) {
        e.printStackTrace();
    } catch (IOException e) {
        e.printStackTrace();
    }
    return pageVisitCounts;
}
public static void main(String[] args) {
    // Drive the counter over a sample access log and print the number of
    // distinct sequences, then the counts sorted largest-first.
    final String logFile = "/home/ieee754/Desktop/test-access.log";
    Map<String, Integer> counts = findThreeMaxPagesPathV1(logFile, ",", 3);
    System.out.println(counts.size());
    System.out.println(MapUtil.sortByValueDescendOrder(counts));
}

Resources