Loading bolzano/download_prepare.sh +32 −30 Original line number Diff line number Diff line #!/bin/bash # file: prepare_data.sh # The MIT License (MIT) # Copyright (c) 2017 Thomas Hütter, Mateusz Pawlik. # # Program: Downloads and prepares data containing the address trees of Bolzano. # Permission is hereby granted, free of charge, to any person obtaining a copy # of this software and associated documentation files (the "Software"), to deal # in the Software without restriction, including without limitation the rights # to use, copy, modify, merge, publish, distribute, sublicense, and/or sell # copies of the Software, and to permit persons to whom the Software is # furnished to do so, subject to the following conditions: # # Author: Thomas Huetter # create target folder and change into it mkdir bolzano-address-trees cd bolzano-address-trees # download the data files # The above copyright notice and this permission notice shall be included in # all copies or substantial portions of the Software. # # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR # IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, # FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE # AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER # LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE # SOFTWARE. # Download the data files. wget https://dbresearch.uni-salzburg.at/projects/pq-gram-ordered-labeled-trees/bolzano-address-trees.zip # unzip the data into folder bolzano-address-trees # Extract data into subdirectory. unzip bolzano-address-trees.zip -d original_data # remove zip file rm -rf bolzano-address-trees.zip # change to unzipped folder cd original_data # prepare data for file L.trees # convert file into UTF-8 format | remove header | remove IDs | sort by number of nodes (equivalent to number of "{") iconv -f ISO-8859-1 -t "UTF-8" L.trees | tail -n +14 | sed 's/.*://' | awk '{print gsub("{","{"), $0}' | sort -n | cut -d' ' -f2- > ../L_preprocessed.txt # prepare data for file R.trees # convert file into UTF-8 format | remove header | remove IDs | sort by number of nodes (equivalent to number of "{") iconv -f ISO-8859-1 -t "UTF-8" R.trees | tail -n +14 | sed 's/.*://' | awk '{print gsub("{","{"), $0}' | sort -n | cut -d' ' -f2- > ../R_preprocessed.txt # prepare dataset with a single label # | remove non-bracket chars. | add single dummy label 'o' > save to file cat ../L_preprocessed.txt | sed 's/[^\{\}]//g' | sed 's/[\{]/\{o/g' > ../L_preprocessed_single_label.bracket # Prepare output from 'L.trees': # - convert file into UTF-8 format # - remove header # - remove IDs # - sort by number of nodes (equivalent to number of "{") iconv -f ISO-8859-1 -t "UTF-8" original_data/L.trees | tail -n +14 | sed 's/.*://' | awk '{print gsub("{","{"), $0}' | sort -n | cut -d' ' -f2- > bolzano_sorted.bracket # go back to the folder cd .. # Tidy up. rm bolzano-address-trees.zip rm -rf original_data No newline at end of file Loading
bolzano/download_prepare.sh +32 −30 Original line number Diff line number Diff line #!/bin/bash # file: prepare_data.sh # The MIT License (MIT) # Copyright (c) 2017 Thomas Hütter, Mateusz Pawlik. # # Program: Downloads and prepares data containing the address trees of Bolzano. # Permission is hereby granted, free of charge, to any person obtaining a copy # of this software and associated documentation files (the "Software"), to deal # in the Software without restriction, including without limitation the rights # to use, copy, modify, merge, publish, distribute, sublicense, and/or sell # copies of the Software, and to permit persons to whom the Software is # furnished to do so, subject to the following conditions: # # Author: Thomas Huetter # create target folder and change into it mkdir bolzano-address-trees cd bolzano-address-trees # download the data files # The above copyright notice and this permission notice shall be included in # all copies or substantial portions of the Software. # # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR # IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, # FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE # AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER # LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE # SOFTWARE. # Download the data files. wget https://dbresearch.uni-salzburg.at/projects/pq-gram-ordered-labeled-trees/bolzano-address-trees.zip # unzip the data into folder bolzano-address-trees # Extract data into subdirectory. unzip bolzano-address-trees.zip -d original_data # remove zip file rm -rf bolzano-address-trees.zip # change to unzipped folder cd original_data # prepare data for file L.trees # convert file into UTF-8 format | remove header | remove IDs | sort by number of nodes (equivalent to number of "{") iconv -f ISO-8859-1 -t "UTF-8" L.trees | tail -n +14 | sed 's/.*://' | awk '{print gsub("{","{"), $0}' | sort -n | cut -d' ' -f2- > ../L_preprocessed.txt # prepare data for file R.trees # convert file into UTF-8 format | remove header | remove IDs | sort by number of nodes (equivalent to number of "{") iconv -f ISO-8859-1 -t "UTF-8" R.trees | tail -n +14 | sed 's/.*://' | awk '{print gsub("{","{"), $0}' | sort -n | cut -d' ' -f2- > ../R_preprocessed.txt # prepare dataset with a single label # | remove non-bracket chars. | add single dummy label 'o' > save to file cat ../L_preprocessed.txt | sed 's/[^\{\}]//g' | sed 's/[\{]/\{o/g' > ../L_preprocessed_single_label.bracket # Prepare output from 'L.trees': # - convert file into UTF-8 format # - remove header # - remove IDs # - sort by number of nodes (equivalent to number of "{") iconv -f ISO-8859-1 -t "UTF-8" original_data/L.trees | tail -n +14 | sed 's/.*://' | awk '{print gsub("{","{"), $0}' | sort -n | cut -d' ' -f2- > bolzano_sorted.bracket # go back to the folder cd .. # Tidy up. rm bolzano-address-trees.zip rm -rf original_data No newline at end of file