Oracle regular expression for Alphanumeric check - oracle

We have below table structure.
From To Output
A001 A555 "A"
A556 A999 "B"
AA01 AA55 "C"
AA56 AA99 "D"
B001 B555 "C"
If my input value if greater than From column and less than To column then return that Output. E.g. If I received A222 then return output as "A". If I received AA22 then return output as "C".
Currently I have handle this in Java but wanted to check if I can write a Oracle query(using Regex) to get above output.
Thanks
Sach

Create your table with the appropriate (virtual) columns and indexes:
Oracle Setup:
CREATE TABLE table_name (
"from" VARCHAR2(10) UNIQUE,
"to" VARCHAR2(10) UNIQUE,
output CHAR(1) NOT NULL,
prefix VARCHAR2(9) GENERATED ALWAYS AS (
CAST(
REGEXP_SUBSTR( "from", '^\D+' )
AS VARCHAR2(9)
)
) VIRTUAL,
from_postfix NUMBER(9) GENERATED ALWAYS AS (
TO_NUMBER( REGEXP_SUBSTR( "from", '\d+$' ) )
) VIRTUAL,
to_postfix NUMBER(9) GENERATED ALWAYS AS (
TO_NUMBER( REGEXP_SUBSTR( "to", '\d+$' ) )
) VIRTUAL,
CONSTRAINT table_name__from_to__u PRIMARY KEY (
prefix, from_postfix, to_postfix
),
CONSTRAINT table_name__f_t_prefix__chk CHECK (
REGEXP_SUBSTR( "from", '\^\D+' ) = REGEXP_SUBSTR( "to", '\^\D+' )
)
);
INSERT INTO table_name ( "from", "to", output )
SELECT 'A001', 'A555', 'A' FROM DUAL UNION ALL
SELECT 'A556', 'A999', 'B' FROM DUAL UNION ALL
SELECT 'AA01', 'AA55', 'C' FROM DUAL UNION ALL
SELECT 'AA56', 'AA99', 'D' FROM DUAL UNION ALL
SELECT 'B001', 'B555', 'C' FROM DUAL;
COMMIT;
Query:
Then your query can use the index and not need to do a full table scan:
SELECT output
FROM table_name
WHERE REGEXP_SUBSTR( :input, '^\D+' ) = prefix
AND TO_NUMBER( REGEXP_SUBSTR( :input, '\d+$' ) )
BETWEEN from_postfix
AND to_postfix;
Output:
If the input bind variable is AA22 then the result is:
OUTPUT
------
C
Explain Plan:
PLAN_TABLE_OUTPUT
-------------------------------------------
Plan hash value: 2408507965
------------------------------------------------------------------------------------------------------
| Id | Operation | Name | Rows | Bytes | Cost (%CPU)| Time |
------------------------------------------------------------------------------------------------------
| 0 | SELECT STATEMENT | | 1 | 35 | 2 (0)| 00:00:01 |
| 1 | TABLE ACCESS BY INDEX ROWID| TABLE_NAME | 1 | 35 | 2 (0)| 00:00:01 |
|* 2 | INDEX RANGE SCAN | TABLE_NAME__FROM_TO__U | 1 | | 2 (0)| 00:00:01 |
------------------------------------------------------------------------------------------------------
Predicate Information (identified by operation id):
---------------------------------------------------
2 - access("PREFIX"= REGEXP_SUBSTR (:INPUT,'^\D+') AND "TO_POSTFIX">=TO_NUMBER(
REGEXP_SUBSTR (:INPUT,'\d+$')) AND "FROM_POSTFIX"<=TO_NUMBER( REGEXP_SUBSTR (:INPUT,'\d+$')))
filter("TO_POSTFIX">=TO_NUMBER( REGEXP_SUBSTR (:INPUT,'\d+$')))

You have to extract varchar and number value from columns and input value and compare it each other
with tabl("FROM","TO",Output) as (
select 'A001','A555','A' from dual union all
select 'A556','A999','B' from dual union all
select 'AA01','AA55','C' from dual union all
select 'AA56','AA99','D' from dual union all
select 'B001','B555','C' from dual)
select * from tabl
where to_number(regexp_substr('AA22','[0-9]+')) between to_number(regexp_substr("FROM",'[0-9]+')) and to_number(regexp_substr("TO",'[0-9]+'))
and regexp_substr('AA22','[^0-9]+') = regexp_substr("FROM",'[^0-9]+')

Related

ORACLE, ignore null to sum

I want to add two columns(number type) of a row, together but when one of those columns is null then the result is null. I handlled it by NVL function(NVL(col1,0) + NVL(col2,0)). But i want to return null when both are null(instead of 0).
how can i handle it?
Is there function in ORACLE that ignore null to sum two columns of a row?
| col1 | col2 | |
|:---- |:-----:|:-----------------------------------:|
| 1 | 2 | --> result have to be : 3 |
| 1 | null | --> result have to be : 1 |
| null | null | --> result have to be : null |
coalesce(col1+col2,col1,col2) would be easier:
with t(col1,col2) as (
select 0,1 from dual union all
select 2,null from dual union all
select null,3 from dual union all
select null,null from dual
)
select
col1,col2,
coalesce(col1+col2,col1,col2) sum_cols
from t;
Results:
COL1 COL2 SUM_COLS
---------- ---------- ----------
0 1 1
2 null 2
null 3 3
null null null
4 rows selected.
or subquery with sum if you have more columns to sum:
(select sum(column_value) from table(sys.odcinumberlist(col1,col2,...,colN)))
Example:
with t(col1,col2) as (
select 0,1 from dual union all
select 2,null from dual union all
select null,3 from dual union all
select null,null from dual
)
select
col1,col2,
coalesce(col1+col2,col1,col2) sum_cols,
(select sum(column_value) from table(sys.odcinumberlist(col1,col2))) sum_cols2
from t;
DBFiddle:
https://dbfiddle.uk/?rdbms=oracle_18&fiddle=eda8de9746f1d0def3c290420adbb705
A CASE expression might be the easiest way to go here:
SELECT CASE WHEN col1 IS NULL AND col2 IS NULL
THEN NULL
ELSE NVL(col1, 0) + NVL(col2, 0) END AS output
FROM yourTable;

Oracle - how to update a unique row based on MAX effective date which is part of the unique index

Oracle - Say you have a table that has a unique key on name, ssn and effective date. The effective date makes it unique. What is the best way to update a current indicator to show inactive for the rows with dates less than the max effective date? I can't really wrap my head around it since there are multiple rows with the same name and ssn combinations. I haven't been able to find this scenario on here for Oracle and I'm having developer's block. Thanks.
"All name/ssn having a max effective date earlier than this time yesterday:"
SELECT name, ssn
FROM t
GROUP BY name, ssn
HAVING MAX(eff_date) < SYSDATE - 1
Oracle supports multi column in, so
UPDATE t
SET current_indicator = 'inactive'
WHERE (name,ssn,eff_date) IN (
SELECT name, ssn, max(eff_date)
FROM t
GROUP BY name, ssn
HAVING MAX(eff_date) < SYSDATE - 1
)
Use a MERGE statement using an analytic function to identify the rows to update and then merge on the ROWID pseudo-column so that Oracle can efficiently identify the rows to update (without having to perform an expensive self-join by comparing the values):
MERGE INTO table_name dst
USING (
SELECT rid,
max_eff_date
FROM (
SELECT ROWID AS rid,
effective_date,
status,
MAX( effective_date ) OVER ( PARTITION BY name, ssn ) AS max_eff_date
FROM table_name
)
WHERE ( effective_date < max_eff_date AND status <> 'inactive' )
OR ( effective_date = max_eff_date AND status <> 'active' )
) src
ON ( dst.ROWID = src.rid )
WHEN MATCHED THEN
UPDATE
SET status = CASE
WHEN src.max_eff_date = dst.effective_date
THEN 'active'
ELSE 'inactive'
END;
So, for some sample data:
CREATE TABLE table_name ( name, ssn, effective_date, status ) AS
SELECT 'aaa', 1, DATE '2020-01-01', 'inactive' FROM DUAL UNION ALL
SELECT 'aaa', 1, DATE '2020-01-02', 'inactive' FROM DUAL UNION ALL
SELECT 'aaa', 1, DATE '2020-01-03', 'inactive' FROM DUAL UNION ALL
SELECT 'bbb', 2, DATE '2020-01-01', 'active' FROM DUAL UNION ALL
SELECT 'bbb', 2, DATE '2020-01-02', 'inactive' FROM DUAL UNION ALL
SELECT 'bbb', 3, DATE '2020-01-01', 'inactive' FROM DUAL UNION ALL
SELECT 'bbb', 3, DATE '2020-01-03', 'active' FROM DUAL;
The query only updates the 3 rows that need changing and:
SELECT *
FROM table_name;
Outputs:
NAME | SSN | EFFECTIVE_DATE | STATUS
:--- | --: | :------------- | :-------
aaa | 1 | 01-JAN-20 | inactive
aaa | 1 | 02-JAN-20 | inactive
aaa | 1 | 03-JAN-20 | active
bbb | 2 | 01-JAN-20 | inactive
bbb | 2 | 02-JAN-20 | active
bbb | 3 | 01-JAN-20 | inactive
bbb | 3 | 03-JAN-20 | active
db<>fiddle here

calculate the average time difference between each stage

How to calculate the average time difference between each stage.
The challenge with the actual data set is not every id will go through all stages.. some will skip stages and the date is not continuous for all Id's like below.
id date status
1 1/1/18 requirement
1 1/8/18 analysis
1 ? design
1 1/30/18 closed
2 2/1/18 requirement
2 2/18/18 closed
3 1/2/18 requirement
3 1/29/18 analysis
3 ? accepted
3 2/5/18 closed
?--we have missing dates as well
Expected output
id date status time_spent
1 1/1/18 requirement 0
1 1/8/18 analysis 7
1 ? design
1 1/30/18 closed 22
2 2/1/18 requirement 0
2 2/18/18 closed 17
3 1/2/18 requirement 0
3 1/29/18 analysis 27
3 ? accepted
3 2/5/18 closed 24
status avg(timespent)
requirement 0
analysis 17
design
closed 21
You can use windowing functions LAG (or LEAD) to get the data of the previous (or next) status for each id. That will let you compute the time elapsed in each stage. Then, compute the average time elapsed for each stage.
Here is an example of how to do that:
with input_data (id, dte, status) as (
SELECT 1, TO_DATE('1/1/18','MM/DD/YY'), 'requirement' FROM DUAL UNION ALL
SELECT 1, TO_DATE('1/8/18','MM/DD/YY'), 'analysis' FROM DUAL UNION ALL
SELECT 1, NULL, 'design' FROM DUAL UNION ALL
SELECT 1, TO_DATE('1/30/18','MM/DD/YY'), 'closed' FROM DUAL UNION ALL
SELECT 2, TO_DATE('2/1/18','MM/DD/YY'), 'requirement' FROM DUAL UNION ALL
SELECT 2, TO_DATE('2/18/18','MM/DD/YY'), 'closed' FROM DUAL UNION ALL
SELECT 3, TO_DATE('1/2/18','MM/DD/YY'), 'requirement' FROM DUAL UNION ALL
SELECT 3, TO_DATE('1/29/18','MM/DD/YY'), 'analysis' FROM DUAL UNION ALL
SELECT 3, NULL, 'accepted' FROM DUAL UNION ALL
SELECT 3, TO_DATE('2/5/18','MM/DD/YY'), 'closed' FROM DUAL ),
----- Solution begins here
data_with_elapsed_days as (
SELECT id.*, dte-nvl(lag(dte ignore nulls) over ( partition by id order by dte ), dte) elapsed
from input_data id)
SELECT status, avg(elapsed)
FROM data_with_elapsed_days d
group by status
order by decode(status,'requirement',1,'analysis',2,'design',3,'accepted',4,'closed',5,99);
+-------------+-------------------------------------------+
| STATUS | AVG(ELAPSED) |
+-------------+-------------------------------------------+
| requirement | 0 |
| analysis | 17 |
| design | |
| accepted | |
| closed | 15.33333333333333333333333333333333333333 |
+-------------+-------------------------------------------+
As I said in my comment, that logic computes the elapsed days as the time to the given status from the prior status. Since, "requirement" has no prior status, this logic will always show zero days spent in requirements. It would probably be better to compute the time from the given status to the next status. For "closed", there would be no next status. You could just leave that blank or use SYSDATE as the data of the next status. Here is an example of that:
with input_data (id, dte, status) as (
SELECT 1, TO_DATE('1/1/18','MM/DD/YY'), 'requirement' FROM DUAL UNION ALL
SELECT 1, TO_DATE('1/8/18','MM/DD/YY'), 'analysis' FROM DUAL UNION ALL
SELECT 1, NULL, 'design' FROM DUAL UNION ALL
SELECT 1, TO_DATE('1/30/18','MM/DD/YY'), 'closed' FROM DUAL UNION ALL
SELECT 2, TO_DATE('2/1/18','MM/DD/YY'), 'requirement' FROM DUAL UNION ALL
SELECT 2, TO_DATE('2/18/18','MM/DD/YY'), 'closed' FROM DUAL UNION ALL
SELECT 3, TO_DATE('1/2/18','MM/DD/YY'), 'requirement' FROM DUAL UNION ALL
SELECT 3, TO_DATE('1/29/18','MM/DD/YY'), 'analysis' FROM DUAL UNION ALL
SELECT 3, NULL, 'accepted' FROM DUAL UNION ALL
SELECT 3, TO_DATE('2/5/18','MM/DD/YY'), 'closed' FROM DUAL ),
----- Solution begins here
data_with_elapsed_days as (
SELECT id.*, nvl(lead(dte ignore nulls) over ( partition by id order by dte ), trunc(sysdate))-dte elapsed
from input_data id)
SELECT status, avg(elapsed)
FROM data_with_elapsed_days d
group by status
order by decode(status,'requirement',1,'analysis',2,'design',3,'accepted',4,'closed',5,99);
+-------------+------------------------------------------+
| STATUS | AVG(ELAPSED) |
+-------------+------------------------------------------+
| requirement | 17 |
| analysis | 14.5 |
| design | |
| accepted | |
| closed | 361.666666666666666666666666666666666667 |
+-------------+------------------------------------------+
I agree with #MatthewMcPeak. Your requirements seem a bit odd: you spend zero days of requirement stage but spend an average of 21 days on closed? Fnord.
This solution treats the presented date as the start date of the stage and calculates the difference between it and the start_date of the next phase.
with cte as (
select status
, lead(dd ignore nulls) over (partition by id order by dd) - dd as dt_diff
from your_table)
select status, avg(dt_diff) as avg_ela
from cte
group by status
/
If you wish to include all stages for each d and estimate the time spent in each (using linear interpolation) then you can create a sub-query with all the statuses and use a PARTITION OUTER JOIN to join them and then use LAG and LEAD to find the date range the status is in and interpolate between:
Oracle Setup:
CREATE TABLE data ( d, dt, status ) AS
SELECT 1, TO_DATE( '1/1/18', 'MM/DD/YY' ), 'requirement' FROM DUAL UNION ALL
SELECT 1, TO_DATE( '1/8/18', 'MM/DD/YY' ), 'analysis' FROM DUAL UNION ALL
SELECT 1, NULL, 'design' FROM DUAL UNION ALL
SELECT 1, TO_DATE( '1/30/18', 'MM/DD/YY' ), 'closed' FROM DUAL UNION ALL
SELECT 2, TO_DATE( '2/1/18', 'MM/DD/YY' ), 'requirement' FROM DUAL UNION ALL
SELECT 2, TO_DATE( '2/18/18', 'MM/DD/YY' ), 'closed' FROM DUAL UNION ALL
SELECT 3, TO_DATE( '1/2/18', 'MM/DD/YY' ), 'requirement' FROM DUAL UNION ALL
SELECT 3, TO_DATE( '1/29/18', 'MM/DD/YY' ), 'analysis' FROM DUAL UNION ALL
SELECT 3, NULL, 'accepted' FROM DUAL UNION ALL
SELECT 3, TO_DATE( '2/5/18', 'MM/DD/YY' ), 'closed' FROM DUAL;
Query:
WITH statuses ( status, id ) AS (
SELECT 'requirement', 1 FROM DUAL UNION ALL
SELECT 'analysis', 2 FROM DUAL UNION ALL
SELECT 'design', 3 FROM DUAL UNION ALL
SELECT 'accepted', 4 FROM DUAL UNION ALL
SELECT 'closed', 5 FROM DUAL
),
ranges ( d, dt, status, id, recent_dt, recent_id, next_dt, next_id ) AS (
SELECT d.d,
d.dt,
s.status,
s.id,
NVL(
d.dt,
LAG( d.dt, 1 )
IGNORE NULLS OVER ( PARTITION BY d.d ORDER BY s.id )
),
NVL2(
d.dt,
s.id,
LAG( CASE WHEN d.dt IS NOT NULL THEN s.id END, 1 )
IGNORE NULLS OVER ( PARTITION BY d.d ORDER BY s.id )
),
LEAD( d.dt, 1, d.dt )
IGNORE NULLS OVER ( PARTITION BY d.d ORDER BY s.id ),
LEAD( CASE WHEN d.dt IS NOT NULL THEN s.id END, 1, s.id + 1 )
IGNORE NULLS OVER ( PARTITION BY d.d ORDER BY s.id )
FROM data d
PARTITION BY ( d )
RIGHT OUTER JOIN statuses s
ON ( d.status = s.status )
)
SELECT d,
dt,
status,
( next_dt - recent_dt ) / (next_id - recent_id ) AS estimated_duration
FROM ranges;
Output:
D | DT | STATUS | ESTIMATED_DURATION
-: | :-------- | :---------- | ---------------------------------------:
1 | 01-JAN-18 | requirement | 7
1 | 08-JAN-18 | analysis | 7.33333333333333333333333333333333333333
1 | null | design | 7.33333333333333333333333333333333333333
1 | null | accepted | 7.33333333333333333333333333333333333333
1 | 30-JAN-18 | closed | 0
2 | 01-FEB-18 | requirement | 4.25
2 | null | analysis | 4.25
2 | null | design | 4.25
2 | null | accepted | 4.25
2 | 18-FEB-18 | closed | 0
3 | 02-JAN-18 | requirement | 27
3 | 29-JAN-18 | analysis | 2.33333333333333333333333333333333333333
3 | null | design | 2.33333333333333333333333333333333333333
3 | null | accepted | 2.33333333333333333333333333333333333333
3 | 05-FEB-18 | closed | 0
Query 2:
Then of you can easily change that to take the average for each status:
WITH statuses ( status, id ) AS (
SELECT 'requirement', 1 FROM DUAL UNION ALL
SELECT 'analysis', 2 FROM DUAL UNION ALL
SELECT 'design', 3 FROM DUAL UNION ALL
SELECT 'accepted', 4 FROM DUAL UNION ALL
SELECT 'closed', 5 FROM DUAL
),
ranges ( d, dt, status, id, recent_dt, recent_id, next_dt, next_id ) AS (
SELECT d.d,
d.dt,
s.status,
s.id,
NVL(
d.dt,
LAG( d.dt, 1 )
IGNORE NULLS OVER ( PARTITION BY d.d ORDER BY s.id )
),
NVL2(
d.dt,
s.id,
LAG( CASE WHEN d.dt IS NOT NULL THEN s.id END, 1 )
IGNORE NULLS OVER ( PARTITION BY d.d ORDER BY s.id )
),
LEAD( d.dt, 1, d.dt )
IGNORE NULLS OVER ( PARTITION BY d.d ORDER BY s.id ),
LEAD( CASE WHEN d.dt IS NOT NULL THEN s.id END, 1, s.id + 1 )
IGNORE NULLS OVER ( PARTITION BY d.d ORDER BY s.id )
FROM data d
PARTITION BY ( d )
RIGHT OUTER JOIN statuses s
ON ( d.status = s.status )
)
SELECT status,
AVG( ( next_dt - recent_dt ) / (next_id - recent_id ) ) AS estimated_duration
FROM ranges
GROUP BY status, id
ORDER BY id;
Results:
STATUS | ESTIMATED_DURATION
:---------- | ---------------------------------------:
requirement | 12.75
analysis | 4.63888888888888888888888888888888888889
design | 4.63888888888888888888888888888888888889
accepted | 4.63888888888888888888888888888888888889
closed | 0
db<>fiddle here

How to use Oracle's LISTAGG function with a multi values?

I have an 'ITEMS' table like below:
ITEM_NO ITEM_NAME
1 Book
2 Pen
3 Sticky Notes
4 Ink
5 Corrector
6 Ruler
In another 'EMP_ITEMS' table I have the below:
EMPLOYEE ITEMS_LIST
John 1,2
Mikel 5
Sophia 2,3,6
William 3,4
Daniel null
Michael 6
The output has to be like this:
EMPLOYEE ITEMS_LIST ITEM_NAME
John 1,2 Book,Pen
Mikel 5 Corrector
Sophia 2,3,6 Pen,Sticky Notes,Ruler
William 3,4 Sticky Notes,Ink
Daniel null null
Michael 6 Ruler
I used the below query:
SELECT e.EMPLOYEE,e.ITEMS_LIST, LISTAGG(i.ITEM_NAME, ',') WITHIN GROUP (ORDER BY i.ITEM_NAME) ITEM_DESC
FROM EMP_ITEMS e
INNER JOIN ITEMS i ON i.ITEM_NO = e.ITEMS_LIST
GROUP BY e.EMPLOYEE,e.ITEMS_LIST;
But there is an error:
ORA-01722: invalid number
But there is an error: ORA-01722: invalid number
That is because your ITEMS_LIST is a string composed of numeric and comma characters and is not actually a list of numbers and you are trying to compare a single item number to a list of items.
Instead treat it as a string a look for sub-string matches. To do this you will need to surround the strings in the delimiter character and compare to see if one is the substring of the other:
SQL Fiddle
Oracle 11g R2 Schema Setup:
CREATE TABLE Items ( ITEM_NO, ITEM_NAME ) As
SELECT 1, 'Book' FROM DUAL UNION ALL
SELECT 2, 'Pen' FROM DUAL UNION ALL
SELECT 3, 'Sticky Notes' FROM DUAL UNION ALL
SELECT 4, 'Ink' FROM DUAL UNION ALL
SELECT 5, 'Corrector' FROM DUAL UNION ALL
SELECT 6, 'Ruler' FROM DUAL;
CREATE TABLE emp_items ( EMPLOYEE, ITEMS_LIST ) AS
SELECT 'John', '1,2' FROM DUAL UNION ALL
SELECT 'Mikel', '5' FROM DUAL UNION ALL
SELECT 'Sophia', '3,2,6' FROM DUAL UNION ALL
SELECT 'William', '3,4' FROM DUAL UNION ALL
SELECT 'Daniel', null FROM DUAL UNION ALL
SELECT 'Michael', '6' FROM DUAL;
Query 1:
SELECT e.employee,
e.items_list,
LISTAGG( i.item_name, ',' )
WITHIN GROUP (
ORDER BY INSTR( ','||e.items_list||',', ','||i.item_no||',' )
) AS item_names
FROM emp_items e
LEFT OUTER JOIN
items i
ON ( ','||e.items_list||',' LIKE '%,'||i.item_no||',%' )
GROUP BY e.employee, e.items_list
Results:
| EMPLOYEE | ITEMS_LIST | ITEM_NAMES |
|----------|------------|------------------------|
| John | 1,2 | Book,Pen |
| Mikel | 5 | Corrector |
| Daniel | (null) | (null) |
| Sophia | 3,2,6 | Sticky Notes,Pen,Ruler |
| Michael | 6 | Ruler |
| William | 3,4 | Sticky Notes,Ink |

match words in all rows in same column

There is a column in table 'mytable' named 'Description'.
+----+-------------------------------+
| ID | Description |
+----+-------------------------------+
| 1 | My NAME is Sajid KHAN |
| 2 | My Name is Ahmed Khan |
| 3 | MY friend name is Salman Khan |
+----+-------------------------------+
I need to write an Oracle SQL query/procedure/function to list the distinct words in the column.
The output should be:
+------------------+-------+
| Word | Count |
+------------------+-------+
| MY | 3 |
| NAME | 3 |
| IS | 3 |
| SAJID | 1 |
| KHAN | 3 |
| AHMED | 1 |
| FRIEND | 1 |
| SALMAN | 1 |
+------------------+-------+
Word matching should be case-insensitive.
I am using Oracle 12.1.
Let's suppose we would somehow manage to split every description in words.
So, instead of single row with Id = 1 and Description = 'My NAME is Sajid KHAN' we'd have 5 rows like this
ID | Description
--- | ------------
1 | My
1 | NAME
1 | is
1 | Sajid
1 | KHAN
in this form it'd be trivial, something like
select Description, count(*) from data_in_new_form group by Description
So, let's do this using recursive query.
create table mytable
as
select 1 as ID, 'My NAME is Sajid KHAN' as Description from dual
union all
select 2, 'My Name is Ahmed Khan' from dual
union all
select 3, 'MY friend name is Salman Khan' from dual
union all
select 4, 'test, punctuation! it is' from dual
;
with
rec (id, str, depth, element_value) as
(
-- Anchor member.
select id, upper(Description) as str, 1 as depth, REGEXP_SUBSTR( upper(Description), '(.*?)( |$)', 1, 1, NULL, 1 ) AS element_value
from mytable
UNION ALL
-- Recursive member.
select id, str, depth + 1, REGEXP_SUBSTR( str ,'(.*?)( |$)', 1, depth+1, NULL, 1 ) AS element_value
from rec
where depth < regexp_count(str, ' ')+1
)
, data as (
select * from rec
--order by id, depth
)
select element_value, count(*) from data
group by element_value
order by element_value
;
Please notice this version doesn't do anything about punctuation assuming words are separated with spaces.
UPDATE alternative way using hierarchic query
with rec as
(
SELECT id, LEVEL AS depth,
REGEXP_SUBSTR( upper(description) ,'(.*?)( |$)', 1, LEVEL, NULL, 1 ) AS element_value
FROM mytable
CONNECT BY LEVEL <= regexp_count(description, ' ')+1
and prior id = id
and prior SYS_GUID() is not null
)
, data as (
select * from rec
--order by id, depth
)
select element_value, count(*) from data
group by element_value
order by 2 desc
;
This query will work. The ordering of the words may be different. However, frequent words come at the beginning as you have listed.
SELECT word,
COUNT(*)
FROM
(SELECT TRIM (REGEXP_SUBSTR (Description, '[^ ]+', 1, ROWNUM) ) AS Word
FROM
(SELECT LISTAGG(UPPER(Description),' ') within GROUP(
ORDER BY ROWNUM ) AS Description
FROM mytable
)
CONNECT BY LEVEL <= REGEXP_COUNT ( Description, '[^ ]+')
)
GROUP BY WORD
ORDER BY 2 DESC;

Resources