Parse Values with Delimiters Using REGEXP_SUPSTR in Oracle 10g - oracle

I have a table called TVL_DETAIL that contains column TVL_CD_LIST. Column TVL_CD_LIST contains three records:
TVL_CD_LIST:
M1180_Z6827
K5900_Z6828
I2510
I've used the following code in an attempt to return the values only(so excluding the underscore):
SELECT
TVL_CD_LIST
FROM TVL_DETAIL
WHERE TVL_CD_LIST IN (SELECT regexp_substr(TVL_CD_LIST,'[^_]+', 1, level) FROM DUAL
CONNECT BY regexp_substr(TVL_CD_LIST,'[^_]+', 1, level) IS NOT NULL)
What I was expecting to see returned in separate rows was:
M1180
Z6827
K5900
Z6828
I2510
But it only returns I2510(which is the original value that doesn't contain an underscore).
What am I doing wrong? Any help is appreciated. Thanks!

To answer your question, you are querying for the list where it matches a sub-element and that will only happen where the list is comprised of one element. What you really wanted to select are the sub-elements themselves.
Note: Explanation of why parsing strings using the regex form '[^_]+' is bad here: https://stackoverflow.com/a/31464699/2543416
You want to parse the list, selecting the elements:
SQL> with TVL_DETAIL(TVL_CD_LIST) as (
select 'M1180_Z6827' from dual union
select 'K5900_Z6828' from dual union
select 'I2510' from dual
)
SELECT distinct regexp_substr(TVL_CD_LIST, '(.*?)(_|$)', 1, level, NULL, 1) element
FROM TVL_DETAIL
CONNECT BY level <= LENGTH(regexp_replace(TVL_CD_LIST, '[^_]', '')) + 1;
-- 11g CONNECT BY level <= regexp_count(TVL_CD_LIST, '_') + 1;
ELEMENT
-----------
Z6827
K5900
M1180
I2510
Z6828
SQL>
And this is cool if you want to track by row and element within row:
SQL> with TVL_DETAIL(row_nbr, TVL_CD_LIST) as (
select 1, 'M1180_Z6827' from dual union
select 2, 'K5900_Z6828' from dual union
select 3, 'I2510' from dual
)
SELECT row_nbr, column_value substring_nbr,
regexp_substr(TVL_CD_LIST, '(.*?)(_|$)', 1, column_value, NULL, 1) element
FROM TVL_DETAIL,
TABLE(
CAST(
MULTISET(SELECT LEVEL
FROM dual
CONNECT BY level <= LENGTH(regexp_replace(TVL_CD_LIST, '[^_]', '')) + 1
-- 11g CONNECT BY LEVEL <= REGEXP_COUNT(TVL_CD_LIST, '_')+1
) AS sys.OdciNumberList
)
)
order by row_nbr, substring_nbr;
ROW_NBR SUBSTRING_NBR ELEMENT
---------- ------------- -----------
1 1 M1180
1 2 Z6827
2 1 K5900
2 2 Z6828
3 1 I2510
SQL>
EDIT: Oops, edited to work with 10g as REGEXP_COUNT is not available until 11g.

The query you have used creates the list but you are comparing the list of record with column it self using the in clause, as such M1180 or Z6827 cannot be equal to M1180_Z6827 and so for K5900_Z6828. I2510 has only one value so it gets matched.
You can use below query if your requirement is exactly what you have mentioned in your desired output.
SQL> WITH tvl_detail AS
2 (SELECT 'M1180_Z6827' tvl_cd_list FROM dual
3 UNION ALL
4 SELECT 'K5900_Z6828' FROM dual
5 UNION ALL
6 SELECT 'I2510' FROM dual)
7 ---------------------------
8 --- End of data preparation
9 ---------------------------
10 SELECT regexp_substr(tvl_cd_list, '[^_]+', 1, LEVEL) AS tvl_cd_list
11 FROM tvl_detail
12 CONNECT BY regexp_substr(tvl_cd_list, '[^_]+', 1, LEVEL) IS NOT NULL
13 AND PRIOR tvl_cd_list = tvl_cd_list
14 AND PRIOR sys_guid() IS NOT NULL;
OUTPUT:
TVL_CD_LIST
--------------------------------------------
I2510
K5900
Z6828
M1180
Z6827

Related

Extract Data Between Brackets in Oracle SQL

I have a table with data in below format
COURSE
[]
["12345"]
["12345","7890"
I want to extract the data between [] but without "
So, my output would be in below format
COURSE
12345
12345, 7890
I tried the below code which works fine for first 3 rows
select REGEXP_SUBSTR (COURSE,
'"([^"]+)"',
1,
1,
NULL,
1) from TEST;
But 4th row only results in 12345.
Why not simple translate?
SQL> with test (course) as
2 (select '[]' from dual union
3 select null from dual union
4 select '["12345"]' from dual union
5 select '["12345","7890"]' from dual
6 )
7 select course,
8 translate(course, 'a[]"', 'a') result
9 from test;
COURSE RESULT
---------------- -----------------------------------------
["12345","7890"] 12345,7890
["12345"] 12345
[]
SQL>

Regular Expression with Connect by

I have this query:
select regexp_substr('1,2,3,4,5','[^,]+',1,level)
from test1
connect by instr('1,2,3,4,5',',',1,level-1)>0;
Please help me to understand this query especially the use of a level and connect by and level-1.
LEVEL and CONNECT BY are used to generate sequences, see this simple example:
select level
from dual
connect by level < 10;
LEVEL
=======
1
2
3
4
5
6
7
8
9
instr('1,2,3,4,5', ',' , 1, level-1) > 0 counts the number of elements separated by comma maybe this version is easier to understand, it does the same:
select regexp_substr('1,2,3,4,5','[^,]+',1,level)
from dual
connect by LEVEL <= REGEXP_COUNT('1,2,3,4,5', ',')+1;
LEVEL <= REGEXP_COUNT('1,2,3,4,5', ',')+1 or instr('1,2,3,4,5', ',' , 1, level-1) > 0 means "run five times".
So, your select is basically a shortcut for
select regexp_substr('1,2,3,4,5','[^,]+', 1, 1) from dual union all
select regexp_substr('1,2,3,4,5','[^,]+', 1, 2) from dual union all
select regexp_substr('1,2,3,4,5','[^,]+', 1, 3) from dual union all
select regexp_substr('1,2,3,4,5','[^,]+', 1, 4) from dual union all
select regexp_substr('1,2,3,4,5','[^,]+', 1, 5) from dual;
#Wernfried's answer is excellent, but you should know there is a big risk with the regex of the format '[^,]+' which one sees often as an example of how to parse delimited strings.
This only works when all elements of the list are present. For an eye-opener, try it with the 2nd element as NULL:
select regexp_substr('1,,3,4,5', '[^,]+', 1, 3) from dual;
REGEXP_SUBSTR('1,,3,4,5','[^,]+',1,3)
-------------------------------------
4
1 row selected.
WHAT? '4' is most definitely NOT the 3rd element of that list! As you can see the NULL is not handled.
Please use this format of REGEXP_SUBSTR which does handle NULL list elements:
select regexp_substr('1,,3,4,5','(.*?)(,|$)', 1, 3, NULL, 1) from dual;
REGEXP_SUBSTR('1,,3,4,5','(.*?)(,|$)',1,3,NULL,1)
-------------------------------------------------
3
1 row selected.
The regex defines defines 2 groups, an optional set of any characters followed by a comma or the end of the line. The arguments to REGEXP_SUBSTR
say to return the 1st group of the 3rd instance of this match.

How to select second split of column data from oracle database

I want to select the data from a Oracle table, whereas the table columns contains the data as , [ex : key,value] separated values; so here I want to select the second split i.e, value
table column data as below :
column_data
++++++++++++++
asper,worse
tincher,good
golder
null -- null values need to eliminate while selection
www,ewe
from the above data, desired output like below:
column_data
+++++++++++++
worse
good
golder
ewe
Please help me with the query
According to data you provided, here are two options:
result1: regular expressions one (get the 2nd word if it exists; otherwise, get the 1st one)
result2: SUBSTR + INSTR combination
SQL> with test (col) as
2 (select 'asper,worse' from dual union all
3 select 'tincher,good' from dual union all
4 select 'golder' from dual union all
5 select null from dual union all
6 select 'www,ewe' from dual
7 )
8 select col,
9 nvl(regexp_substr(col, '\w+', 1, 2), regexp_substr(col, '\w+', 1,1 )) result1,
10 --
11 nvl(substr(col, instr(col, ',') + 1), col) result2
12 from test
13 where col is not null;
COL RESULT1 RESULT2
------------ -------------------- --------------------
asper,worse worse worse
tincher,good good good
golder golder golder
www,ewe ewe ewe
SQL>

Oracle: find the largest number within one string

I have some strings within a column of a table like asdfAB98:(hjkl,)AB188(uiop)uuuAB78:jknd. I am wondering how may I extract the largest number within such a string in each row. For example, here the largest number is 188 (out of 188, 98 and 78).
Since the numbers I'm interested in are always right after AB, I was thinking about using regexp_substr. Unfortunately, I'm not sure how to have it output multiple rows so that I can use max clause. PLSQL language would be great as well. Please show me a simple example if you have an idea. Thank you in advance!
You could tokenize the string into all its number components, and then find the maximum:
select max(to_number(
regexp_substr('sdfAB98:(hjkl,)AB188(uiop)uuuAB78:jknd', '(\d+)', 1, level))
) as max_value
from dual
connect by regexp_substr('sdfAB98:(hjkl,)AB188(uiop)uuuAB78:jknd', '(\d+)', 1, level)
is not null;
MAX_VALUE
----------
188
or
select max(to_number(
regexp_substr('sdfAB98:(hjkl,)AB188(uiop)uuuAB78:jknd', '(\d+)', 1, level, null, 1))
) as max_value
from dual
connect by level <= regexp_count('sdfAB98:(hjkl,)AB188(uiop)uuuAB78:jknd', '\d+');
MAX_VALUE
----------
188
If you need to get values from multiple rows you need the connect-by to match the IDs, and also need to include a reference to a non-deterministic function to prevent looping; with two values in a CTE:
with your_table (id, str) as (
select 1, 'sdfAB98:(hjkl,)AB188(uiop)uuuAB78:jknd' from dual
union all select 2, '123abc456abc78d9' from dual
)
select id, max(to_number(regexp_substr(str, '(\d+)', 1, level, null, 1))) as max_value
from your_table
connect by prior id = id
and prior dbms_random.value is not null
and level <= regexp_count(str, '\d+')
group by id;
ID MAX_VALUE
---------- ----------
1 188
2 456
Alternatively (to Alex' answer), if there are multiple rows:
SQL> with your_table (id, str) as (
2 select 1, 'sdfAB98:(hjkl,)AB188(uiop)uuuAB78:jknd' from dual
3 union all select 2, '123abc456abc78d9' from dual
4 )
5 select id, max(to_number(regexp_substr(str, '\d+', 1, column_value))) max_num
6 from your_table,
7 table(cast(multiset(select level from dual
8 connect by level <= regexp_count(str, '\d+')
9 ) as sys.odcinumberlist))
10 group by id;
ID MAX_NUM
---------- ----------
1 188
2 456
SQL>

remove a varchar2 string from the middle of table data values

Data in the file_name field of the generation table should be an assigned number, then _01, _02, or _03, etc. and then .pdf (example 82617_01.pdf).
Somewhere, the program is putting a state name and sometimes a date/time stamp, between the assigned number and the 01, 02, etc. (82617_ALABAMA_01.pdf or 19998_MAINE_07-31-2010_11-05-59_AM.pdf or 5485325_OREGON_01.pdf for example).
We would like to develop a SQL statement to find the bad file names and fix them. In theory it seems rather simple to find file names that include a varchar2 data type and remove it, but putting the statement together is beyond me.
Any help or suggestions appreciated.
Something like:
UPDATE GENERATION
SET FILE_NAME (?)
WHERE FILE_NAME (?...LIKE '%STRING%');?
You can find the problem rows like this:
select *
from Files
where length(FILE_NAME) - length(replace(FILE_NAME, '_', '')) > 1
You can fix them like this:
update Files
set FILE_NAME = SUBSTR(FILE_NAME, 1, instr(FILE_NAME, '_') -1) ||
SUBSTR(FILE_NAME, instr(FILE_NAME, '_', 1, 2))
where length(FILE_NAME) - length(replace(FILE_NAME, '_', '')) > 1
SQL Fiddle Example
You can also use Regexp_replace function:
SQL> with t1(col) as(
2 select '82617_mm_01.pdf' from dual union all
3 select '456546_khkjh_89kjh_67_01.pdf' from dual union all
4 select '19998_MAINE_07-31-2010_11-05-59_AM.pdf' from dual union all
5 select '5485325_OREGON_01.pdf' from dual
6 )
7 select col
8 , regexp_replace(col, '^([0-9]+)_(.*)_(\d{2}\.pdf)$', '\1_\3') res
9 from t1;
COL RES
-------------------------------------- -----------------------------------------
82617_mm_01.pdf 82617_01.pdf
456546_khkjh_89kjh_67_01.pdf 456546_01.pdf
19998_MAINE_07-31-2010_11-05-59_AM.pdf 19998_MAINE_07-31-2010_11-05-59_AM.pdf
5485325_OREGON_01.pdf 5485325_01.pdf
To display good or bad data regexp_like function will come in handy:
SQL> with t1(col) as(
2 select '826170_01.pdf' from dual union all
3 select '456546_01.pdf' from dual union all
4 select '19998_MAINE_07-31-2010_11-05-59_AM.pdf' from dual union all
5 select '5485325_OREGON_01.pdf' from dual
6 )
7 select col bad_data
8 from t1
9 where not regexp_like(col, '^[0-9]+_\d{2}\.pdf$');
BAD_DATA
--------------------------------------
19998_MAINE_07-31-2010_11-05-59_AM.pdf
5485325_OREGON_01.pdf
SQL> with t1(col) as(
2 select '826170_01.pdf' from dual union all
3 select '456546_01.pdf' from dual union all
4 select '19998_MAINE_07-31-2010_11-05-59_AM.pdf' from dual union all
5 select '5485325_OREGON_01.pdf' from dual
6 )
7 select col good_data
8 from t1
9 where regexp_like(col, '^[0-9]+_\d{2}\.pdf$');
GOOD_DATA
--------------------------------------
826170_01.pdf
456546_01.pdf
To that end your update statement might look like this:
update your_table
set col = regexp_replace(col, '^([0-9]+)_(.*)_(\d{2}\.pdf)$', '\1_\3');
--where clause if needed

Resources