Loading sentiment/download_prepare.sh +35 −22 Original line number Original line Diff line number Diff line #!/bin/bash #!/bin/bash # file: prepare_data.sh # The MIT License (MIT) # Copyright (c) 2017 Thomas Hütter, Mateusz Pawlik. # # # Program: Downloads and prepares data containing the sentiment dataset. # Permission is hereby granted, free of charge, to any person obtaining a copy # of this software and associated documentation files (the "Software"), to deal # in the Software without restriction, including without limitation the rights # to use, copy, modify, merge, publish, distribute, sublicense, and/or sell # copies of the Software, and to permit persons to whom the Software is # furnished to do so, subject to the following conditions: # # # Author: Thomas Huetter # The above copyright notice and this permission notice shall be included in # all copies or substantial portions of the Software. # create target folder and change into it # mkdir sentiment # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR cd sentiment # IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, # FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE # AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER # LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE # SOFTWARE. # download the data files # Download the data files. wget https://nlp.stanford.edu/sentiment/trainDevTestTrees_PTB.zip wget https://nlp.stanford.edu/sentiment/trainDevTestTrees_PTB.zip # unzip the data into folder original_data # Extract the data into subdirectory. unzip trainDevTestTrees_PTB.zip unzip trainDevTestTrees_PTB.zip -d original_data # remove zip file rm -rf trainDevTestTrees_PTB.zip # change to unzipped folder cd trees # prepare data for file L.trees # Prepare output from 'dev.txt' and 'train.txt': # convert dev.txt and train.txt into UTF-8 format | replace ( by { | replace ) by } | remove whitespace before '{' | sort by number of nodes (equivalent to number of "{") # - convert dev.txt and train.txt into UTF-8 format iconv -f ISO-8859-1 -t "UTF-8" dev.txt train.txt | sed -e 's/(/{/g' | sed -e 's/)/}/g' | sed -E 's/[[:space:]]([{])/\1/g' | awk '{print gsub("{","{"), $0}' | sort -n | cut -d' ' -f2- > ../sentiment_sorted.bracket # - replace ( by { # - replace ) by } # - remove whitespace before '{' # - sort by number of nodes (equivalent to number of "{") iconv -f ISO-8859-1 -t "UTF-8" original_data/trees/dev.txt original_data/trees/train.txt | \ sed -e 's/(/{/g' | sed -e 's/)/}/g' | sed -E 's/[[:space:]]([{])/\1/g' | \ awk '{print gsub("{","{"), $0}' | sort -n | cut -d' ' -f2- > sentiment_sorted.bracket # go back to the folder # Tidy up. cd .. rm trainDevTestTrees_PTB.zip rm -rf original_data No newline at end of file Loading
sentiment/download_prepare.sh +35 −22 Original line number Original line Diff line number Diff line #!/bin/bash #!/bin/bash # file: prepare_data.sh # The MIT License (MIT) # Copyright (c) 2017 Thomas Hütter, Mateusz Pawlik. # # # Program: Downloads and prepares data containing the sentiment dataset. # Permission is hereby granted, free of charge, to any person obtaining a copy # of this software and associated documentation files (the "Software"), to deal # in the Software without restriction, including without limitation the rights # to use, copy, modify, merge, publish, distribute, sublicense, and/or sell # copies of the Software, and to permit persons to whom the Software is # furnished to do so, subject to the following conditions: # # # Author: Thomas Huetter # The above copyright notice and this permission notice shall be included in # all copies or substantial portions of the Software. # create target folder and change into it # mkdir sentiment # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR cd sentiment # IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, # FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE # AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER # LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE # SOFTWARE. # download the data files # Download the data files. wget https://nlp.stanford.edu/sentiment/trainDevTestTrees_PTB.zip wget https://nlp.stanford.edu/sentiment/trainDevTestTrees_PTB.zip # unzip the data into folder original_data # Extract the data into subdirectory. unzip trainDevTestTrees_PTB.zip unzip trainDevTestTrees_PTB.zip -d original_data # remove zip file rm -rf trainDevTestTrees_PTB.zip # change to unzipped folder cd trees # prepare data for file L.trees # Prepare output from 'dev.txt' and 'train.txt': # convert dev.txt and train.txt into UTF-8 format | replace ( by { | replace ) by } | remove whitespace before '{' | sort by number of nodes (equivalent to number of "{") # - convert dev.txt and train.txt into UTF-8 format iconv -f ISO-8859-1 -t "UTF-8" dev.txt train.txt | sed -e 's/(/{/g' | sed -e 's/)/}/g' | sed -E 's/[[:space:]]([{])/\1/g' | awk '{print gsub("{","{"), $0}' | sort -n | cut -d' ' -f2- > ../sentiment_sorted.bracket # - replace ( by { # - replace ) by } # - remove whitespace before '{' # - sort by number of nodes (equivalent to number of "{") iconv -f ISO-8859-1 -t "UTF-8" original_data/trees/dev.txt original_data/trees/train.txt | \ sed -e 's/(/{/g' | sed -e 's/)/}/g' | sed -E 's/[[:space:]]([{])/\1/g' | \ awk '{print gsub("{","{"), $0}' | sort -n | cut -d' ' -f2- > sentiment_sorted.bracket # go back to the folder # Tidy up. cd .. rm trainDevTestTrees_PTB.zip rm -rf original_data No newline at end of file