Loading bolzano/download_prepare.sh +2 −1 Original line number Diff line number Diff line Loading @@ -32,7 +32,8 @@ unzip bolzano-address-trees.zip -d original_data # - remove header # - remove IDs # - sort by number of nodes (equivalent to number of "{") iconv -f ISO-8859-1 -t "UTF-8" original_data/L.trees | tail -n +14 | sed 's/.*://' | awk '{print gsub("{","{"), $0}' | sort -n | cut -d' ' -f2- > bolzano_sorted.bracket iconv -f ISO-8859-1 -t "UTF-8" original_data/L.trees | tail -n +14 | sed 's/.*://' | \ ../utilities/sort_dataset.sh > bolzano_sorted.bracket # Tidy up. rm bolzano-address-trees.zip Loading sentiment/download_prepare.sh +1 −1 Original line number Diff line number Diff line Loading @@ -35,7 +35,7 @@ unzip trainDevTestTrees_PTB.zip -d original_data # - sort by number of nodes (equivalent to number of "{") iconv -f ISO-8859-1 -t "UTF-8" original_data/trees/dev.txt original_data/trees/train.txt | \ sed -e 's/(/{/g' | sed -e 's/)/}/g' | sed -E 's/[[:space:]]([{])/\1/g' | \ awk '{print gsub("{","{"), $0}' | sort -n | cut -d' ' -f2- > sentiment_sorted.bracket ../utilities/sort_dataset.sh > sentiment_sorted.bracket # Tidy up. rm trainDevTestTrees_PTB.zip Loading utilities/sort_dataset.sh +34 −3 Original line number Diff line number Diff line #!/bin/bash # | sort by number of nodes (equivalent to number of "{"), remove duplicates # cat $1 | perl -F{ -lane 'print "$#F $_"' | sort -n | cut -d' ' -f2- > "${1%.bracket}_sorted.bracket" cat $1 | awk '{print gsub("{","{")-gsub("\\\\{","\\{"), $0}' | sort -n | cut -d' ' -f2- > "${1%.bracket}_sorted.bracket" # The MIT License (MIT) # Copyright (c) 2017 Thomas Hütter, Mateusz Pawlik. 
# # Permission is hereby granted, free of charge, to any person obtaining a copy # of this software and associated documentation files (the "Software"), to deal # in the Software without restriction, including without limitation the rights # to use, copy, modify, merge, publish, distribute, sublicense, and/or sell # copies of the Software, and to permit persons to whom the Software is # furnished to do so, subject to the following conditions: # # The above copyright notice and this permission notice shall be included in # all copies or substantial portions of the Software. # # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR # IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, # FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE # AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER # LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE # SOFTWARE. # The script accepts input from file or stdin in case a file is not given. # Solution taken from: # https://superuser.com/questions/747884/how-to-write-a-script-that-accepts-input-from-a-file-or-from-stdin [ $# -ge 1 -a -f "$1" ] && input="$1" || input="-" # Sort the trees in the input file by the number of nodes which is equivalent # to the number of open bracket characters "{". # # NOTE: We subtract the escaped brackets because they're part of node labels. # cat $input | awk '{print gsub("{","{")-gsub("\\\\{","\\{"), $0}' | sort -n | \ cut -d' ' -f2- # > "${1%.bracket}_sorted.bracket" Loading
bolzano/download_prepare.sh +2 −1 Original line number Diff line number Diff line Loading @@ -32,7 +32,8 @@ unzip bolzano-address-trees.zip -d original_data # - remove header # - remove IDs # - sort by number of nodes (equivalent to number of "{") iconv -f ISO-8859-1 -t "UTF-8" original_data/L.trees | tail -n +14 | sed 's/.*://' | awk '{print gsub("{","{"), $0}' | sort -n | cut -d' ' -f2- > bolzano_sorted.bracket iconv -f ISO-8859-1 -t "UTF-8" original_data/L.trees | tail -n +14 | sed 's/.*://' | \ ../utilities/sort_dataset.sh > bolzano_sorted.bracket # Tidy up. rm bolzano-address-trees.zip Loading
sentiment/download_prepare.sh +1 −1 Original line number Diff line number Diff line Loading @@ -35,7 +35,7 @@ unzip trainDevTestTrees_PTB.zip -d original_data # - sort by number of nodes (equivalent to number of "{") iconv -f ISO-8859-1 -t "UTF-8" original_data/trees/dev.txt original_data/trees/train.txt | \ sed -e 's/(/{/g' | sed -e 's/)/}/g' | sed -E 's/[[:space:]]([{])/\1/g' | \ awk '{print gsub("{","{"), $0}' | sort -n | cut -d' ' -f2- > sentiment_sorted.bracket ../utilities/sort_dataset.sh > sentiment_sorted.bracket # Tidy up. rm trainDevTestTrees_PTB.zip Loading
utilities/sort_dataset.sh +34 −3 Original line number Diff line number Diff line #!/bin/bash # | sort by number of nodes (equivalent to number of "{"), remove duplicates # cat $1 | perl -F{ -lane 'print "$#F $_"' | sort -n | cut -d' ' -f2- > "${1%.bracket}_sorted.bracket" cat $1 | awk '{print gsub("{","{")-gsub("\\\\{","\\{"), $0}' | sort -n | cut -d' ' -f2- > "${1%.bracket}_sorted.bracket" # The MIT License (MIT) # Copyright (c) 2017 Thomas Hütter, Mateusz Pawlik. # # Permission is hereby granted, free of charge, to any person obtaining a copy # of this software and associated documentation files (the "Software"), to deal # in the Software without restriction, including without limitation the rights # to use, copy, modify, merge, publish, distribute, sublicense, and/or sell # copies of the Software, and to permit persons to whom the Software is # furnished to do so, subject to the following conditions: # # The above copyright notice and this permission notice shall be included in # all copies or substantial portions of the Software. # # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR # IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, # FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE # AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER # LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE # SOFTWARE. # The script accepts input from file or stdin in case a file is not given. # Solution taken from: # https://superuser.com/questions/747884/how-to-write-a-script-that-accepts-input-from-a-file-or-from-stdin [ $# -ge 1 -a -f "$1" ] && input="$1" || input="-" # Sort the trees in the input file by the number of nodes which is equivalent # to the number of open bracket characters "{". # # NOTE: We subtract the escaped brackets because they're part of node labels.
#
# Prefix every tree with its node count (all "{" minus the escaped "\{"),
# sort numerically on that count, then strip the count column again.
# "$input" is quoted so that a file name containing spaces or glob characters
# reaches cat as a single argument; when it is "-", cat reads from stdin.
cat "$input" | awk '{print gsub("{","{")-gsub("\\\\{","\\{"), $0}' | sort -n | \
cut -d' ' -f2- # > "${1%.bracket}_sorted.bracket"